VirtualBox

source: vbox/trunk/src/VBox/VMM/VMMAll/IEMAllAImplC.cpp@ 104195

Last change on this file since 104195 was 104195, checked in by vboxsync, 8 months ago

VMM/IEM: Refactoring assembly helpers to not pass eflags by reference but instead by value and return the updated value (via eax/w0) - first chunk: ADD,ADC,SUB,SBB,CMP,TEST,AND,OR,XOR. bugref:10376

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 726.5 KB
Line 
1/* $Id: IEMAllAImplC.cpp 104195 2024-04-05 14:45:23Z vboxsync $ */
2/** @file
3 * IEM - Instruction Implementation in Assembly, portable C variant.
4 */
5
6/*
7 * Copyright (C) 2011-2024 Oracle and/or its affiliates.
8 *
9 * This file is part of VirtualBox base platform packages, as
10 * available from https://www.virtualbox.org.
11 *
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License
14 * as published by the Free Software Foundation, in version 3 of the
15 * License.
16 *
17 * This program is distributed in the hope that it will be useful, but
18 * WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 * General Public License for more details.
21 *
22 * You should have received a copy of the GNU General Public License
23 * along with this program; if not, see <https://www.gnu.org/licenses>.
24 *
25 * SPDX-License-Identifier: GPL-3.0-only
26 */
27
28
29/*********************************************************************************************************************************
30* Header Files *
31*********************************************************************************************************************************/
32#include "IEMInternal.h"
33#include <VBox/vmm/vmcc.h>
34#include <iprt/errcore.h>
35#include <iprt/x86.h>
36#include <iprt/uint128.h>
37#include <iprt/uint256.h>
38#include <iprt/crc.h>
39
40RT_C_DECLS_BEGIN
41#include <softfloat.h>
42RT_C_DECLS_END
43
44
45/*********************************************************************************************************************************
46* Defined Constants And Macros *
47*********************************************************************************************************************************/
/** @def IEM_WITHOUT_ASSEMBLY
 * Enables all the code in this file.
 *
 * Unless the build already defines it, it is switched on automatically for
 * ARM32 and ARM64 hosts and when doxygen is running.
 */
#if    !defined(IEM_WITHOUT_ASSEMBLY) \
    && (defined(RT_ARCH_ARM32) || defined(RT_ARCH_ARM64) || defined(DOXYGEN_RUNNING))
# define IEM_WITHOUT_ASSEMBLY
#endif

/* tstIEMAImplAsm needs the assembly variants: IEM_WITH_ASSEMBLY always wins
   over IEM_WITHOUT_ASSEMBLY, no matter how the latter got defined. */
#if defined(IEM_WITH_ASSEMBLY)
# undef IEM_WITHOUT_ASSEMBLY
#endif
60
/**
 * Derives the sign flag (SF) from a result of the given operand width.
 *
 * SF is a copy of the most significant bit of the result, so the result is
 * shifted right until that bit sits in the SF position and every other bit is
 * masked away.
 *
 * @returns X86_EFL_SF or 0.
 * @param   a_uResult       Unsigned result value.
 * @param   a_cBitsWidth    The width of the result in bits (8, 16, 32, 64).
 */
#define X86_EFL_CALC_SF(a_uResult, a_cBitsWidth) \
    ( X86_EFL_SF & (uint32_t)((a_uResult) >> ((a_cBitsWidth) - (X86_EFL_SF_BIT + 1))) )
73
/**
 * Derives the zero flag (ZF) from a result.
 *
 * ZF is set exactly when the result is zero.
 *
 * @returns X86_EFL_ZF or 0.
 * @param   a_uResult   Unsigned result value.
 */
#define X86_EFL_CALC_ZF(a_uResult) \
    ( (uint32_t)(!(a_uResult)) << X86_EFL_ZF_BIT )
84
85/**
86 * Calculates the parity flag.
87 *
88 * @returns X86_EFL_PF or 0.
89 * @param a_uResult Unsigned result value.
90 */
91#if !defined(RT_ARCH_ARM64) || 1 /** @todo profile this... micro benching in tstIEMAImpl indicates no gain, but it may be skewed. */
92# define IEM_EFL_CALC_PARITY(a_uResult) (g_afParity[(a_uResult) & 0xff])
93#else
94# define IEM_EFL_CALC_PARITY(a_uResult) iemAImplCalcParity(a_uResult)
95DECL_FORCE_INLINE(uint32_t) iemAImplCalcParity(uint32_t uResult)
96{
97 /* Emulate 8-bit pop count. This translates to 4 EOR instructions on
98 ARM64 as they can shift the 2nd source operand. */
99 uint8_t bPf = uResult ^ (uResult >> 4);
100 bPf ^= bPf >> 2;
101 bPf ^= bPf >> 1;
102 bPf ^= 1;
103 return (bPf & 1) << X86_EFL_PF_BIT;
104}
105#endif
106
/**
 * Extracts the OF flag from an OF calculation result.
 *
 * The most significant bit of the value (for the width named in the macro) is
 * moved into the OF position.  The width is part of the name so that the
 * variant can be picked by concatenating a bit count.  The 8-bit variant is
 * the odd one out: it has to shift left, all the wider ones shift right.
 *
 * @returns X86_EFL_OF or 0.
 * @param   a_uValue    The value whose top bit carries the overflow indicator.
 */
#define X86_EFL_GET_OF_8(a_uValue)  ( X86_EFL_OF & ((uint32_t)(a_uValue) << (X86_EFL_OF_BIT + 1 - 8)) )
#define X86_EFL_GET_OF_16(a_uValue) ( X86_EFL_OF & (uint32_t)((a_uValue) >> (16 - (X86_EFL_OF_BIT + 1))) )
#define X86_EFL_GET_OF_32(a_uValue) ( X86_EFL_OF & (uint32_t)((a_uValue) >> (32 - (X86_EFL_OF_BIT + 1))) )
#define X86_EFL_GET_OF_64(a_uValue) ( X86_EFL_OF & (uint32_t)((a_uValue) >> (64 - (X86_EFL_OF_BIT + 1))) )
117
/**
 * Updates the status bits (CF, PF, AF, ZF, SF, and OF) after arithmetic op.
 *
 * This is a statement macro: it does not yield a value but clears
 * X86_EFL_STATUS_BITS in @a a_fEFlagsVar and ORs in the freshly calculated
 * flags.  The value arguments are expanded more than once, so they must be
 * free of side effects.
 *
 * @param   a_fEFlagsVar    The 32-bit EFLAGS variable to update (modifiable
 *                          lvalue, expanded unparenthesized).
 * @param   a_uResult       Unsigned result value.
 * @param   a_uDst          The original destination value (for AF+OF calc).
 * @param   a_uSrc          The source value (for AF calc).
 * @param   a_cBitsWidth    The width of the result (8, 16, 32, 64).  Must be
 *                          a plain literal as it is token pasted into
 *                          X86_EFL_GET_OF_xx and uintxx_t.
 * @param   a_CfExpr        Bool expression for the carry flag (CF).
 * @param   a_uSrcOf        The a_uSrc value to use for overflow calculation.
 */
#define IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(a_fEFlagsVar, a_uResult, a_uDst, a_uSrc, a_cBitsWidth, a_CfExpr, a_uSrcOf) \
    do { \
        a_fEFlagsVar &= ~X86_EFL_STATUS_BITS; \
        a_fEFlagsVar |= (a_CfExpr) << X86_EFL_CF_BIT; \
        a_fEFlagsVar |= IEM_EFL_CALC_PARITY(a_uResult); \
        a_fEFlagsVar |= ((uint32_t)(a_uResult) ^ (uint32_t)(a_uSrc) ^ (uint32_t)(a_uDst)) & X86_EFL_AF; \
        a_fEFlagsVar |= X86_EFL_CALC_ZF(a_uResult); \
        a_fEFlagsVar |= X86_EFL_CALC_SF(a_uResult, a_cBitsWidth); \
        \
        /* Overflow during ADDition happens when both inputs have the same sign \
           bit value and the result has a different sign bit value. \
           \
           Since subtraction can be rewritten as addition: 2 - 1 == 2 + -1, it \
           follows that for SUBtraction the sign bit value must differ between \
           the two inputs and the result's sign bit must differ from the first \
           input.  Callers doing SUBtraction therefore pass a_uSrc with its sign \
           bit flipped as a_uSrcOf. \
           Note! Must xor with sign bit to convert, not do (0 - a_uSrc). \
           \
           See also: http://teaching.idallen.com/dat2343/10f/notes/040_overflow.txt */ \
        a_fEFlagsVar |= X86_EFL_GET_OF_ ## a_cBitsWidth(  (  ((uint ## a_cBitsWidth ## _t)~((a_uDst) ^ (a_uSrcOf))) \
                                                           & RT_BIT_64(a_cBitsWidth - 1)) \
                                                        & ((a_uResult) ^ (a_uDst)) ); \
    } while (0)
152
/**
 * Updates the status bits (CF, PF, AF, ZF, SF, and OF) after a logical op.
 *
 * CF and OF are defined to be 0 by logical operations.  AF on the other hand is
 * undefined.  We clear AF, as that seems to make the most sense and also seems
 * to be the correct behavior on current CPUs.
 *
 * This is a statement macro: it does not yield a value but clears
 * X86_EFL_STATUS_BITS in @a a_fEFlagsVar and ORs in PF, ZF, SF and
 * @a a_fExtra.  @a a_uResult is expanded more than once, so it must be free
 * of side effects.
 *
 * @param   a_fEFlagsVar    The 32-bit EFLAGS variable to update (modifiable
 *                          lvalue, expanded unparenthesized).
 * @param   a_uResult       Unsigned result value.
 * @param   a_cBitsWidth    The width of the result (8, 16, 32, 64).
 * @param   a_fExtra        Additional bits to set.
 */
#define IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(a_fEFlagsVar, a_uResult, a_cBitsWidth, a_fExtra) \
    do { \
        a_fEFlagsVar &= ~X86_EFL_STATUS_BITS; \
        a_fEFlagsVar |= IEM_EFL_CALC_PARITY(a_uResult); \
        a_fEFlagsVar |= X86_EFL_CALC_ZF(a_uResult); \
        a_fEFlagsVar |= X86_EFL_CALC_SF(a_uResult, a_cBitsWidth); \
        a_fEFlagsVar |= (a_fExtra); \
    } while (0)
174
175
176/*********************************************************************************************************************************
177* Global Variables *
178*********************************************************************************************************************************/
179/**
180 * Parity calculation table.
181 *
182 * This is also used by iemAllAImpl.asm.
183 *
184 * The generator code:
185 * @code
186 * #include <stdio.h>
187 *
188 * int main()
189 * {
190 * unsigned b;
191 * for (b = 0; b < 256; b++)
192 * {
193 * int cOnes = ( b & 1)
194 * + ((b >> 1) & 1)
195 * + ((b >> 2) & 1)
196 * + ((b >> 3) & 1)
197 * + ((b >> 4) & 1)
198 * + ((b >> 5) & 1)
199 * + ((b >> 6) & 1)
200 * + ((b >> 7) & 1);
201 * printf(" /" "* %#04x = %u%u%u%u%u%u%u%ub *" "/ %s,\n",
202 * b,
203 * (b >> 7) & 1,
204 * (b >> 6) & 1,
205 * (b >> 5) & 1,
206 * (b >> 4) & 1,
207 * (b >> 3) & 1,
208 * (b >> 2) & 1,
209 * (b >> 1) & 1,
210 * b & 1,
211 * cOnes & 1 ? "0" : "X86_EFL_PF");
212 * }
213 * return 0;
214 * }
215 * @endcode
216 */
217uint8_t const g_afParity[256] =
218{
219 /* 0000 = 00000000b */ X86_EFL_PF,
220 /* 0x01 = 00000001b */ 0,
221 /* 0x02 = 00000010b */ 0,
222 /* 0x03 = 00000011b */ X86_EFL_PF,
223 /* 0x04 = 00000100b */ 0,
224 /* 0x05 = 00000101b */ X86_EFL_PF,
225 /* 0x06 = 00000110b */ X86_EFL_PF,
226 /* 0x07 = 00000111b */ 0,
227 /* 0x08 = 00001000b */ 0,
228 /* 0x09 = 00001001b */ X86_EFL_PF,
229 /* 0x0a = 00001010b */ X86_EFL_PF,
230 /* 0x0b = 00001011b */ 0,
231 /* 0x0c = 00001100b */ X86_EFL_PF,
232 /* 0x0d = 00001101b */ 0,
233 /* 0x0e = 00001110b */ 0,
234 /* 0x0f = 00001111b */ X86_EFL_PF,
235 /* 0x10 = 00010000b */ 0,
236 /* 0x11 = 00010001b */ X86_EFL_PF,
237 /* 0x12 = 00010010b */ X86_EFL_PF,
238 /* 0x13 = 00010011b */ 0,
239 /* 0x14 = 00010100b */ X86_EFL_PF,
240 /* 0x15 = 00010101b */ 0,
241 /* 0x16 = 00010110b */ 0,
242 /* 0x17 = 00010111b */ X86_EFL_PF,
243 /* 0x18 = 00011000b */ X86_EFL_PF,
244 /* 0x19 = 00011001b */ 0,
245 /* 0x1a = 00011010b */ 0,
246 /* 0x1b = 00011011b */ X86_EFL_PF,
247 /* 0x1c = 00011100b */ 0,
248 /* 0x1d = 00011101b */ X86_EFL_PF,
249 /* 0x1e = 00011110b */ X86_EFL_PF,
250 /* 0x1f = 00011111b */ 0,
251 /* 0x20 = 00100000b */ 0,
252 /* 0x21 = 00100001b */ X86_EFL_PF,
253 /* 0x22 = 00100010b */ X86_EFL_PF,
254 /* 0x23 = 00100011b */ 0,
255 /* 0x24 = 00100100b */ X86_EFL_PF,
256 /* 0x25 = 00100101b */ 0,
257 /* 0x26 = 00100110b */ 0,
258 /* 0x27 = 00100111b */ X86_EFL_PF,
259 /* 0x28 = 00101000b */ X86_EFL_PF,
260 /* 0x29 = 00101001b */ 0,
261 /* 0x2a = 00101010b */ 0,
262 /* 0x2b = 00101011b */ X86_EFL_PF,
263 /* 0x2c = 00101100b */ 0,
264 /* 0x2d = 00101101b */ X86_EFL_PF,
265 /* 0x2e = 00101110b */ X86_EFL_PF,
266 /* 0x2f = 00101111b */ 0,
267 /* 0x30 = 00110000b */ X86_EFL_PF,
268 /* 0x31 = 00110001b */ 0,
269 /* 0x32 = 00110010b */ 0,
270 /* 0x33 = 00110011b */ X86_EFL_PF,
271 /* 0x34 = 00110100b */ 0,
272 /* 0x35 = 00110101b */ X86_EFL_PF,
273 /* 0x36 = 00110110b */ X86_EFL_PF,
274 /* 0x37 = 00110111b */ 0,
275 /* 0x38 = 00111000b */ 0,
276 /* 0x39 = 00111001b */ X86_EFL_PF,
277 /* 0x3a = 00111010b */ X86_EFL_PF,
278 /* 0x3b = 00111011b */ 0,
279 /* 0x3c = 00111100b */ X86_EFL_PF,
280 /* 0x3d = 00111101b */ 0,
281 /* 0x3e = 00111110b */ 0,
282 /* 0x3f = 00111111b */ X86_EFL_PF,
283 /* 0x40 = 01000000b */ 0,
284 /* 0x41 = 01000001b */ X86_EFL_PF,
285 /* 0x42 = 01000010b */ X86_EFL_PF,
286 /* 0x43 = 01000011b */ 0,
287 /* 0x44 = 01000100b */ X86_EFL_PF,
288 /* 0x45 = 01000101b */ 0,
289 /* 0x46 = 01000110b */ 0,
290 /* 0x47 = 01000111b */ X86_EFL_PF,
291 /* 0x48 = 01001000b */ X86_EFL_PF,
292 /* 0x49 = 01001001b */ 0,
293 /* 0x4a = 01001010b */ 0,
294 /* 0x4b = 01001011b */ X86_EFL_PF,
295 /* 0x4c = 01001100b */ 0,
296 /* 0x4d = 01001101b */ X86_EFL_PF,
297 /* 0x4e = 01001110b */ X86_EFL_PF,
298 /* 0x4f = 01001111b */ 0,
299 /* 0x50 = 01010000b */ X86_EFL_PF,
300 /* 0x51 = 01010001b */ 0,
301 /* 0x52 = 01010010b */ 0,
302 /* 0x53 = 01010011b */ X86_EFL_PF,
303 /* 0x54 = 01010100b */ 0,
304 /* 0x55 = 01010101b */ X86_EFL_PF,
305 /* 0x56 = 01010110b */ X86_EFL_PF,
306 /* 0x57 = 01010111b */ 0,
307 /* 0x58 = 01011000b */ 0,
308 /* 0x59 = 01011001b */ X86_EFL_PF,
309 /* 0x5a = 01011010b */ X86_EFL_PF,
310 /* 0x5b = 01011011b */ 0,
311 /* 0x5c = 01011100b */ X86_EFL_PF,
312 /* 0x5d = 01011101b */ 0,
313 /* 0x5e = 01011110b */ 0,
314 /* 0x5f = 01011111b */ X86_EFL_PF,
315 /* 0x60 = 01100000b */ X86_EFL_PF,
316 /* 0x61 = 01100001b */ 0,
317 /* 0x62 = 01100010b */ 0,
318 /* 0x63 = 01100011b */ X86_EFL_PF,
319 /* 0x64 = 01100100b */ 0,
320 /* 0x65 = 01100101b */ X86_EFL_PF,
321 /* 0x66 = 01100110b */ X86_EFL_PF,
322 /* 0x67 = 01100111b */ 0,
323 /* 0x68 = 01101000b */ 0,
324 /* 0x69 = 01101001b */ X86_EFL_PF,
325 /* 0x6a = 01101010b */ X86_EFL_PF,
326 /* 0x6b = 01101011b */ 0,
327 /* 0x6c = 01101100b */ X86_EFL_PF,
328 /* 0x6d = 01101101b */ 0,
329 /* 0x6e = 01101110b */ 0,
330 /* 0x6f = 01101111b */ X86_EFL_PF,
331 /* 0x70 = 01110000b */ 0,
332 /* 0x71 = 01110001b */ X86_EFL_PF,
333 /* 0x72 = 01110010b */ X86_EFL_PF,
334 /* 0x73 = 01110011b */ 0,
335 /* 0x74 = 01110100b */ X86_EFL_PF,
336 /* 0x75 = 01110101b */ 0,
337 /* 0x76 = 01110110b */ 0,
338 /* 0x77 = 01110111b */ X86_EFL_PF,
339 /* 0x78 = 01111000b */ X86_EFL_PF,
340 /* 0x79 = 01111001b */ 0,
341 /* 0x7a = 01111010b */ 0,
342 /* 0x7b = 01111011b */ X86_EFL_PF,
343 /* 0x7c = 01111100b */ 0,
344 /* 0x7d = 01111101b */ X86_EFL_PF,
345 /* 0x7e = 01111110b */ X86_EFL_PF,
346 /* 0x7f = 01111111b */ 0,
347 /* 0x80 = 10000000b */ 0,
348 /* 0x81 = 10000001b */ X86_EFL_PF,
349 /* 0x82 = 10000010b */ X86_EFL_PF,
350 /* 0x83 = 10000011b */ 0,
351 /* 0x84 = 10000100b */ X86_EFL_PF,
352 /* 0x85 = 10000101b */ 0,
353 /* 0x86 = 10000110b */ 0,
354 /* 0x87 = 10000111b */ X86_EFL_PF,
355 /* 0x88 = 10001000b */ X86_EFL_PF,
356 /* 0x89 = 10001001b */ 0,
357 /* 0x8a = 10001010b */ 0,
358 /* 0x8b = 10001011b */ X86_EFL_PF,
359 /* 0x8c = 10001100b */ 0,
360 /* 0x8d = 10001101b */ X86_EFL_PF,
361 /* 0x8e = 10001110b */ X86_EFL_PF,
362 /* 0x8f = 10001111b */ 0,
363 /* 0x90 = 10010000b */ X86_EFL_PF,
364 /* 0x91 = 10010001b */ 0,
365 /* 0x92 = 10010010b */ 0,
366 /* 0x93 = 10010011b */ X86_EFL_PF,
367 /* 0x94 = 10010100b */ 0,
368 /* 0x95 = 10010101b */ X86_EFL_PF,
369 /* 0x96 = 10010110b */ X86_EFL_PF,
370 /* 0x97 = 10010111b */ 0,
371 /* 0x98 = 10011000b */ 0,
372 /* 0x99 = 10011001b */ X86_EFL_PF,
373 /* 0x9a = 10011010b */ X86_EFL_PF,
374 /* 0x9b = 10011011b */ 0,
375 /* 0x9c = 10011100b */ X86_EFL_PF,
376 /* 0x9d = 10011101b */ 0,
377 /* 0x9e = 10011110b */ 0,
378 /* 0x9f = 10011111b */ X86_EFL_PF,
379 /* 0xa0 = 10100000b */ X86_EFL_PF,
380 /* 0xa1 = 10100001b */ 0,
381 /* 0xa2 = 10100010b */ 0,
382 /* 0xa3 = 10100011b */ X86_EFL_PF,
383 /* 0xa4 = 10100100b */ 0,
384 /* 0xa5 = 10100101b */ X86_EFL_PF,
385 /* 0xa6 = 10100110b */ X86_EFL_PF,
386 /* 0xa7 = 10100111b */ 0,
387 /* 0xa8 = 10101000b */ 0,
388 /* 0xa9 = 10101001b */ X86_EFL_PF,
389 /* 0xaa = 10101010b */ X86_EFL_PF,
390 /* 0xab = 10101011b */ 0,
391 /* 0xac = 10101100b */ X86_EFL_PF,
392 /* 0xad = 10101101b */ 0,
393 /* 0xae = 10101110b */ 0,
394 /* 0xaf = 10101111b */ X86_EFL_PF,
395 /* 0xb0 = 10110000b */ 0,
396 /* 0xb1 = 10110001b */ X86_EFL_PF,
397 /* 0xb2 = 10110010b */ X86_EFL_PF,
398 /* 0xb3 = 10110011b */ 0,
399 /* 0xb4 = 10110100b */ X86_EFL_PF,
400 /* 0xb5 = 10110101b */ 0,
401 /* 0xb6 = 10110110b */ 0,
402 /* 0xb7 = 10110111b */ X86_EFL_PF,
403 /* 0xb8 = 10111000b */ X86_EFL_PF,
404 /* 0xb9 = 10111001b */ 0,
405 /* 0xba = 10111010b */ 0,
406 /* 0xbb = 10111011b */ X86_EFL_PF,
407 /* 0xbc = 10111100b */ 0,
408 /* 0xbd = 10111101b */ X86_EFL_PF,
409 /* 0xbe = 10111110b */ X86_EFL_PF,
410 /* 0xbf = 10111111b */ 0,
411 /* 0xc0 = 11000000b */ X86_EFL_PF,
412 /* 0xc1 = 11000001b */ 0,
413 /* 0xc2 = 11000010b */ 0,
414 /* 0xc3 = 11000011b */ X86_EFL_PF,
415 /* 0xc4 = 11000100b */ 0,
416 /* 0xc5 = 11000101b */ X86_EFL_PF,
417 /* 0xc6 = 11000110b */ X86_EFL_PF,
418 /* 0xc7 = 11000111b */ 0,
419 /* 0xc8 = 11001000b */ 0,
420 /* 0xc9 = 11001001b */ X86_EFL_PF,
421 /* 0xca = 11001010b */ X86_EFL_PF,
422 /* 0xcb = 11001011b */ 0,
423 /* 0xcc = 11001100b */ X86_EFL_PF,
424 /* 0xcd = 11001101b */ 0,
425 /* 0xce = 11001110b */ 0,
426 /* 0xcf = 11001111b */ X86_EFL_PF,
427 /* 0xd0 = 11010000b */ 0,
428 /* 0xd1 = 11010001b */ X86_EFL_PF,
429 /* 0xd2 = 11010010b */ X86_EFL_PF,
430 /* 0xd3 = 11010011b */ 0,
431 /* 0xd4 = 11010100b */ X86_EFL_PF,
432 /* 0xd5 = 11010101b */ 0,
433 /* 0xd6 = 11010110b */ 0,
434 /* 0xd7 = 11010111b */ X86_EFL_PF,
435 /* 0xd8 = 11011000b */ X86_EFL_PF,
436 /* 0xd9 = 11011001b */ 0,
437 /* 0xda = 11011010b */ 0,
438 /* 0xdb = 11011011b */ X86_EFL_PF,
439 /* 0xdc = 11011100b */ 0,
440 /* 0xdd = 11011101b */ X86_EFL_PF,
441 /* 0xde = 11011110b */ X86_EFL_PF,
442 /* 0xdf = 11011111b */ 0,
443 /* 0xe0 = 11100000b */ 0,
444 /* 0xe1 = 11100001b */ X86_EFL_PF,
445 /* 0xe2 = 11100010b */ X86_EFL_PF,
446 /* 0xe3 = 11100011b */ 0,
447 /* 0xe4 = 11100100b */ X86_EFL_PF,
448 /* 0xe5 = 11100101b */ 0,
449 /* 0xe6 = 11100110b */ 0,
450 /* 0xe7 = 11100111b */ X86_EFL_PF,
451 /* 0xe8 = 11101000b */ X86_EFL_PF,
452 /* 0xe9 = 11101001b */ 0,
453 /* 0xea = 11101010b */ 0,
454 /* 0xeb = 11101011b */ X86_EFL_PF,
455 /* 0xec = 11101100b */ 0,
456 /* 0xed = 11101101b */ X86_EFL_PF,
457 /* 0xee = 11101110b */ X86_EFL_PF,
458 /* 0xef = 11101111b */ 0,
459 /* 0xf0 = 11110000b */ X86_EFL_PF,
460 /* 0xf1 = 11110001b */ 0,
461 /* 0xf2 = 11110010b */ 0,
462 /* 0xf3 = 11110011b */ X86_EFL_PF,
463 /* 0xf4 = 11110100b */ 0,
464 /* 0xf5 = 11110101b */ X86_EFL_PF,
465 /* 0xf6 = 11110110b */ X86_EFL_PF,
466 /* 0xf7 = 11110111b */ 0,
467 /* 0xf8 = 11111000b */ 0,
468 /* 0xf9 = 11111001b */ X86_EFL_PF,
469 /* 0xfa = 11111010b */ X86_EFL_PF,
470 /* 0xfb = 11111011b */ 0,
471 /* 0xfc = 11111100b */ X86_EFL_PF,
472 /* 0xfd = 11111101b */ 0,
473 /* 0xfe = 11111110b */ 0,
474 /* 0xff = 11111111b */ X86_EFL_PF,
475};
476
477/* for clang: */
478extern const RTFLOAT32U g_ar32Zero[];
479extern const RTFLOAT64U g_ar64Zero[];
480extern const RTFLOAT80U g_ar80Zero[];
481extern const RTFLOAT32U g_ar32One[];
482extern const RTFLOAT80U g_ar80One[];
483extern const RTFLOAT80U g_r80Indefinite;
484extern const RTFLOAT32U g_ar32Infinity[];
485extern const RTFLOAT64U g_ar64Infinity[];
486extern const RTFLOAT80U g_ar80Infinity[];
487extern const RTFLOAT128U g_r128Ln2;
488extern const RTUINT128U g_u128Ln2Mantissa;
489extern const RTUINT128U g_u128Ln2MantissaIntel;
490extern const RTFLOAT128U g_ar128F2xm1HornerConsts[];
491extern const RTFLOAT32U g_ar32QNaN[];
492extern const RTFLOAT64U g_ar64QNaN[];
493
494/** Zero values (indexed by fSign). */
495RTFLOAT32U const g_ar32Zero[] = { RTFLOAT32U_INIT_ZERO(0), RTFLOAT32U_INIT_ZERO(1) };
496RTFLOAT64U const g_ar64Zero[] = { RTFLOAT64U_INIT_ZERO(0), RTFLOAT64U_INIT_ZERO(1) };
497RTFLOAT80U const g_ar80Zero[] = { RTFLOAT80U_INIT_ZERO(0), RTFLOAT80U_INIT_ZERO(1) };
498
499/** One values (indexed by fSign). */
500RTFLOAT32U const g_ar32One[] =
501{ RTFLOAT32U_INIT(0, 0, RTFLOAT32U_EXP_BIAS), RTFLOAT32U_INIT(1, 0, RTFLOAT32U_EXP_BIAS) };
502RTFLOAT80U const g_ar80One[] =
503{ RTFLOAT80U_INIT(0, RT_BIT_64(63), RTFLOAT80U_EXP_BIAS), RTFLOAT80U_INIT(1, RT_BIT_64(63), RTFLOAT80U_EXP_BIAS) };
504
505/** Indefinite (negative). */
506RTFLOAT80U const g_r80Indefinite = RTFLOAT80U_INIT_INDEFINITE(1);
507
508/** Infinities (indexed by fSign). */
509RTFLOAT32U const g_ar32Infinity[] = { RTFLOAT32U_INIT_INF(0), RTFLOAT32U_INIT_INF(1) };
510RTFLOAT64U const g_ar64Infinity[] = { RTFLOAT64U_INIT_INF(0), RTFLOAT64U_INIT_INF(1) };
511RTFLOAT80U const g_ar80Infinity[] = { RTFLOAT80U_INIT_INF(0), RTFLOAT80U_INIT_INF(1) };
512
513/** Default QNaNs (indexed by fSign). */
514RTFLOAT32U const g_ar32QNaN[] = { RTFLOAT32U_INIT_QNAN(0), RTFLOAT32U_INIT_QNAN(1) };
515RTFLOAT64U const g_ar64QNaN[] = { RTFLOAT64U_INIT_QNAN(0), RTFLOAT64U_INIT_QNAN(1) };
516
517
518#if 0
519/** 128-bit floating point constant: 2.0 */
520const RTFLOAT128U g_r128Two = RTFLOAT128U_INIT_C(0, 0, 0, RTFLOAT128U_EXP_BIAS + 1);
521#endif
522
523
524/* The next section is generated by tools/IEMGenFpuConstants: */
525
526/** The ln2 constant as 128-bit floating point value.
527 * base-10: 6.93147180559945309417232121458176575e-1
528 * base-16: b.17217f7d1cf79abc9e3b39803f30@-1
529 * base-2 : 1.0110001011100100001011111110111110100011100111101111001101010111100100111100011101100111001100000000011111100110e-1 */
530//const RTFLOAT128U g_r128Ln2 = RTFLOAT128U_INIT_C(0, 0x62e42fefa39e, 0xf35793c7673007e6, 0x3ffe);
531const RTFLOAT128U g_r128Ln2 = RTFLOAT128U_INIT_C(0, 0x62e42fefa39e, 0xf357900000000000, 0x3ffe);
532/** High precision ln2 value.
533 * base-10: 6.931471805599453094172321214581765680747e-1
534 * base-16: b.17217f7d1cf79abc9e3b39803f2f6af0@-1
535 * base-2 : 1.0110001011100100001011111110111110100011100111101111001101010111100100111100011101100111001100000000011111100101111011010101111e-1 */
536const RTUINT128U g_u128Ln2Mantissa = RTUINT128_INIT_C(0xb17217f7d1cf79ab, 0xc9e3b39803f2f6af);
537/** High precision ln2 value, compatible with f2xm1 results on intel 10980XE.
538 * base-10: 6.931471805599453094151379470289064954613e-1
539 * base-16: b.17217f7d1cf79abc0000000000000000@-1
540 * base-2 : 1.0110001011100100001011111110111110100011100111101111001101010111100000000000000000000000000000000000000000000000000000000000000e-1 */
541const RTUINT128U g_u128Ln2MantissaIntel = RTUINT128_INIT_C(0xb17217f7d1cf79ab, 0xc000000000000000);
542
543/** Horner constants for f2xm1 */
544const RTFLOAT128U g_ar128F2xm1HornerConsts[] =
545{
546 /* a0
547 * base-10: 1.00000000000000000000000000000000000e0
548 * base-16: 1.0000000000000000000000000000@0
549 * base-2 : 1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000e0 */
550 RTFLOAT128U_INIT_C(0, 0x000000000000, 0x0000000000000000, 0x3fff),
551 /* a1
552 * base-10: 5.00000000000000000000000000000000000e-1
553 * base-16: 8.0000000000000000000000000000@-1
554 * base-2 : 1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000e-1 */
555 RTFLOAT128U_INIT_C(0, 0x000000000000, 0x0000000000000000, 0x3ffe),
556 /* a2
557 * base-10: 1.66666666666666666666666666666666658e-1
558 * base-16: 2.aaaaaaaaaaaaaaaaaaaaaaaaaaaa@-1
559 * base-2 : 1.0101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101e-3 */
560 RTFLOAT128U_INIT_C(0, 0x555555555555, 0x5555555555555555, 0x3ffc),
561 /* a3
562 * base-10: 4.16666666666666666666666666666666646e-2
563 * base-16: a.aaaaaaaaaaaaaaaaaaaaaaaaaaa8@-2
564 * base-2 : 1.0101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101e-5 */
565 RTFLOAT128U_INIT_C(0, 0x555555555555, 0x5555555555555555, 0x3ffa),
566 /* a4
567 * base-10: 8.33333333333333333333333333333333323e-3
568 * base-16: 2.2222222222222222222222222222@-2
569 * base-2 : 1.0001000100010001000100010001000100010001000100010001000100010001000100010001000100010001000100010001000100010001e-7 */
570 RTFLOAT128U_INIT_C(0, 0x111111111111, 0x1111111111111111, 0x3ff8),
571 /* a5
572 * base-10: 1.38888888888888888888888888888888874e-3
573 * base-16: 5.b05b05b05b05b05b05b05b05b058@-3
574 * base-2 : 1.0110110000010110110000010110110000010110110000010110110000010110110000010110110000010110110000010110110000010110e-10 */
575 RTFLOAT128U_INIT_C(0, 0x6c16c16c16c1, 0x6c16c16c16c16c16, 0x3ff5),
576 /* a6
577 * base-10: 1.98412698412698412698412698412698412e-4
578 * base-16: d.00d00d00d00d00d00d00d00d00d0@-4
579 * base-2 : 1.1010000000011010000000011010000000011010000000011010000000011010000000011010000000011010000000011010000000011010e-13 */
580 RTFLOAT128U_INIT_C(0, 0xa01a01a01a01, 0xa01a01a01a01a01a, 0x3ff2),
581 /* a7
582 * base-10: 2.48015873015873015873015873015873015e-5
583 * base-16: 1.a01a01a01a01a01a01a01a01a01a@-4
584 * base-2 : 1.1010000000011010000000011010000000011010000000011010000000011010000000011010000000011010000000011010000000011010e-16 */
585 RTFLOAT128U_INIT_C(0, 0xa01a01a01a01, 0xa01a01a01a01a01a, 0x3fef),
586 /* a8
587 * base-10: 2.75573192239858906525573192239858902e-6
588 * base-16: 2.e3bc74aad8e671f5583911ca002e@-5
589 * base-2 : 1.0111000111011110001110100101010101101100011100110011100011111010101011000001110010001000111001010000000000010111e-19 */
590 RTFLOAT128U_INIT_C(0, 0x71de3a556c73, 0x38faac1c88e50017, 0x3fec),
591 /* a9
592 * base-10: 2.75573192239858906525573192239858865e-7
593 * base-16: 4.9f93edde27d71cbbc05b4fa999e0@-6
594 * base-2 : 1.0010011111100100111110110111011110001001111101011100011100101110111100000001011011010011111010100110011001111000e-22 */
595 RTFLOAT128U_INIT_C(0, 0x27e4fb7789f5, 0xc72ef016d3ea6678, 0x3fe9),
596 /* a10
597 * base-10: 2.50521083854417187750521083854417184e-8
598 * base-16: 6.b99159fd5138e3f9d1f92e0df71c@-7
599 * base-2 : 1.1010111001100100010101100111111101010100010011100011100011111110011101000111111001001011100000110111110111000111e-26 */
600 RTFLOAT128U_INIT_C(0, 0xae64567f544e, 0x38fe747e4b837dc7, 0x3fe5),
601 /* a11
602 * base-10: 2.08767569878680989792100903212014296e-9
603 * base-16: 8.f76c77fc6c4bdaa26d4c3d67f420@-8
604 * base-2 : 1.0001111011101101100011101111111110001101100010010111101101010100010011011010100110000111101011001111111010000100e-29 */
605 RTFLOAT128U_INIT_C(0, 0x1eed8eff8d89, 0x7b544da987acfe84, 0x3fe2),
606 /* a12
607 * base-10: 1.60590438368216145993923771701549472e-10
608 * base-16: b.092309d43684be51c198e91d7b40@-9
609 * base-2 : 1.0110000100100100011000010011101010000110110100001001011111001010001110000011001100011101001000111010111101101000e-33 */
610 RTFLOAT128U_INIT_C(0, 0x6124613a86d0, 0x97ca38331d23af68, 0x3fde),
611 /* a13
612 * base-10: 1.14707455977297247138516979786821043e-11
613 * base-16: c.9cba54603e4e905d6f8a2efd1f20@-10
614 * base-2 : 1.1001001110010111010010101000110000000111110010011101001000001011101011011111000101000101110111111010001111100100e-37 */
615 RTFLOAT128U_INIT_C(0, 0x93974a8c07c9, 0xd20badf145dfa3e4, 0x3fda),
616 /* a14
617 * base-10: 7.64716373181981647590113198578806964e-13
618 * base-16: d.73f9f399dc0f88ec32b587746578@-11
619 * base-2 : 1.1010111001111111001111100111001100111011100000011111000100011101100001100101011010110000111011101000110010101111e-41 */
620 RTFLOAT128U_INIT_C(0, 0xae7f3e733b81, 0xf11d8656b0ee8caf, 0x3fd6),
621 /* a15
622 * base-10: 4.77947733238738529743820749111754352e-14
623 * base-16: d.73f9f399dc0f88ec32b587746578@-12
624 * base-2 : 1.1010111001111111001111100111001100111011100000011111000100011101100001100101011010110000111011101000110010101111e-45 */
625 RTFLOAT128U_INIT_C(0, 0xae7f3e733b81, 0xf11d8656b0ee8caf, 0x3fd2),
626 /* a16
627 * base-10: 2.81145725434552076319894558301031970e-15
628 * base-16: c.a963b81856a53593028cbbb8d7f8@-13
629 * base-2 : 1.1001010100101100011101110000001100001010110101001010011010110010011000000101000110010111011101110001101011111111e-49 */
630 RTFLOAT128U_INIT_C(0, 0x952c77030ad4, 0xa6b2605197771aff, 0x3fce),
631 /* a17
632 * base-10: 1.56192069685862264622163643500573321e-16
633 * base-16: b.413c31dcbecbbdd8024435161550@-14
634 * base-2 : 1.0110100000100111100001100011101110010111110110010111011110111011000000000100100010000110101000101100001010101010e-53 */
635 RTFLOAT128U_INIT_C(0, 0x6827863b97d9, 0x77bb004886a2c2aa, 0x3fca),
636 /* a18
637 * base-10: 8.22063524662432971695598123687227980e-18
638 * base-16: 9.7a4da340a0ab92650f61dbdcb3a0@-15
639 * base-2 : 1.0010111101001001101101000110100000010100000101010111001001001100101000011110110000111011011110111001011001110100e-57 */
640 RTFLOAT128U_INIT_C(0, 0x2f49b4681415, 0x724ca1ec3b7b9674, 0x3fc6),
641 /* a19
642 * base-10: 4.11031762331216485847799061843614006e-19
643 * base-16: 7.950ae900808941ea72b4afe3c2e8@-16
644 * base-2 : 1.1110010101000010101110100100000000100000001000100101000001111010100111001010110100101011111110001111000010111010e-62 */
645 RTFLOAT128U_INIT_C(0, 0xe542ba402022, 0x507a9cad2bf8f0ba, 0x3fc1),
646 /* a20
647 * base-10: 1.95729410633912612308475743735054143e-20
648 * base-16: 5.c6e3bdb73d5c62fbc51bf3b9b8fc@-17
649 * base-2 : 1.0111000110111000111011110110110111001111010101110001100010111110111100010100011011111100111011100110111000111111e-66 */
650 RTFLOAT128U_INIT_C(0, 0x71b8ef6dcf57, 0x18bef146fcee6e3f, 0x3fbd),
651 /* a21
652 * base-10: 8.89679139245057328674889744250246106e-22
653 * base-16: 4.338e5b6dfe14a5143242dfcce3a0@-18
654 * base-2 : 1.0000110011100011100101101101101101111111100001010010100101000101000011001001000010110111111100110011100011101000e-70 */
655 RTFLOAT128U_INIT_C(0, 0x0ce396db7f85, 0x29450c90b7f338e8, 0x3fb9),
656};
657
658
659/*
660 * There are a few 64-bit on 32-bit things we'd rather do in C. Actually, doing
661 * it all in C is probably safer atm., optimize what's necessary later, maybe.
662 */
663#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
664
665
666/*********************************************************************************************************************************
667* Binary Operations *
668*********************************************************************************************************************************/
669
670/*
671 * ADD
672 */
673
674IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_add_u64,(uint32_t fEFlags, uint64_t *puDst, uint64_t uSrc))
675{
676 uint64_t uDst = *puDst;
677 uint64_t uResult = uDst + uSrc;
678 *puDst = uResult;
679 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(fEFlags, uResult, uDst, uSrc, 64, uResult < uDst, uSrc);
680 return fEFlags;
681}
682
683# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
684
685IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_add_u32,(uint32_t fEFlags, uint32_t *puDst, uint32_t uSrc))
686{
687 uint32_t uDst = *puDst;
688 uint32_t uResult = uDst + uSrc;
689 *puDst = uResult;
690 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(fEFlags, uResult, uDst, uSrc, 32, uResult < uDst, uSrc);
691 return fEFlags;
692}
693
694
695IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_add_u16,(uint32_t fEFlags, uint16_t *puDst, uint16_t uSrc))
696{
697 uint16_t uDst = *puDst;
698 uint16_t uResult = uDst + uSrc;
699 *puDst = uResult;
700 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(fEFlags, uResult, uDst, uSrc, 16, uResult < uDst, uSrc);
701 return fEFlags;
702}
703
704
705IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_add_u8,(uint32_t fEFlags, uint8_t *puDst, uint8_t uSrc))
706{
707 uint8_t uDst = *puDst;
708 uint8_t uResult = uDst + uSrc;
709 *puDst = uResult;
710 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(fEFlags, uResult, uDst, uSrc, 8, uResult < uDst, uSrc);
711 return fEFlags;
712}
713
714# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
715
716/*
717 * ADC
718 */
719
720IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_adc_u64,(uint32_t fEFlags, uint64_t *puDst, uint64_t uSrc))
721{
722 if (!(fEFlags & X86_EFL_CF))
723 fEFlags = iemAImpl_add_u64(fEFlags, puDst, uSrc);
724 else
725 {
726 uint64_t uDst = *puDst;
727 uint64_t uResult = uDst + uSrc + 1;
728 *puDst = uResult;
729 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(fEFlags, uResult, uDst, uSrc, 64, uResult <= uDst, uSrc);
730 }
731 return fEFlags;
732}
733
734# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
735
736IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_adc_u32,(uint32_t fEFlags, uint32_t *puDst, uint32_t uSrc))
737{
738 if (!(fEFlags & X86_EFL_CF))
739 fEFlags = iemAImpl_add_u32(fEFlags, puDst, uSrc);
740 else
741 {
742 uint32_t uDst = *puDst;
743 uint32_t uResult = uDst + uSrc + 1;
744 *puDst = uResult;
745 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(fEFlags, uResult, uDst, uSrc, 32, uResult <= uDst, uSrc);
746 }
747 return fEFlags;
748}
749
750
751IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_adc_u16,(uint32_t fEFlags, uint16_t *puDst, uint16_t uSrc))
752{
753 if (!(fEFlags & X86_EFL_CF))
754 fEFlags = iemAImpl_add_u16(fEFlags, puDst, uSrc);
755 else
756 {
757 uint16_t uDst = *puDst;
758 uint16_t uResult = uDst + uSrc + 1;
759 *puDst = uResult;
760 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(fEFlags, uResult, uDst, uSrc, 16, uResult <= uDst, uSrc);
761 }
762 return fEFlags;
763}
764
765
766IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_adc_u8,(uint32_t fEFlags, uint8_t *puDst, uint8_t uSrc))
767{
768 if (!(fEFlags & X86_EFL_CF))
769 fEFlags = iemAImpl_add_u8(fEFlags, puDst, uSrc);
770 else
771 {
772 uint8_t uDst = *puDst;
773 uint8_t uResult = uDst + uSrc + 1;
774 *puDst = uResult;
775 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(fEFlags, uResult, uDst, uSrc, 8, uResult <= uDst, uSrc);
776 }
777 return fEFlags;
778}
779
780# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
781
782/*
783 * SUB
784 */
785# if !defined(RT_ARCH_ARM64)
786
787IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_sub_u64,(uint32_t fEFlags, uint64_t *puDst, uint64_t uSrc))
788{
789 uint64_t uDst = *puDst;
790 uint64_t uResult = uDst - uSrc;
791 *puDst = uResult;
792 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(fEFlags, uResult, uDst, uSrc, 64, uDst < uSrc, uSrc ^ RT_BIT_64(63));
793 return fEFlags;
794}
795
796# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
797
798IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_sub_u32,(uint32_t fEFlags, uint32_t *puDst, uint32_t uSrc))
799{
800 uint32_t uDst = *puDst;
801 uint32_t uResult = uDst - uSrc;
802 *puDst = uResult;
803 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(fEFlags, uResult, uDst, uSrc, 32, uDst < uSrc, uSrc ^ RT_BIT_32(31));
804 return fEFlags;
805}
806
807
808IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_sub_u16,(uint32_t fEFlags, uint16_t *puDst, uint16_t uSrc))
809{
810 uint16_t uDst = *puDst;
811 uint16_t uResult = uDst - uSrc;
812 *puDst = uResult;
813 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(fEFlags, uResult, uDst, uSrc, 16, uDst < uSrc, uSrc ^ (uint16_t)0x8000);
814 return fEFlags;
815}
816
817
818IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_sub_u8,(uint32_t fEFlags, uint8_t *puDst, uint8_t uSrc))
819{
820 uint8_t uDst = *puDst;
821 uint8_t uResult = uDst - uSrc;
822 *puDst = uResult;
823 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(fEFlags, uResult, uDst, uSrc, 8, uDst < uSrc, uSrc ^ (uint8_t)0x80);
824 return fEFlags;
825}
826
827# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
828# endif /* !RT_ARCH_ARM64 */
829
830/*
831 * SBB
832 */
833
834IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_sbb_u64,(uint32_t fEFlags, uint64_t *puDst, uint64_t uSrc))
835{
836 if (!(fEFlags & X86_EFL_CF))
837 fEFlags = iemAImpl_sub_u64(fEFlags, puDst, uSrc);
838 else
839 {
840 uint64_t uDst = *puDst;
841 uint64_t uResult = uDst - uSrc - 1;
842 *puDst = uResult;
843 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(fEFlags, uResult, uDst, uSrc, 64, uDst <= uSrc, uSrc ^ RT_BIT_64(63));
844 }
845 return fEFlags;
846}
847
848# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
849
850IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_sbb_u32,(uint32_t fEFlags, uint32_t *puDst, uint32_t uSrc))
851{
852 if (!(fEFlags & X86_EFL_CF))
853 fEFlags = iemAImpl_sub_u32(fEFlags, puDst, uSrc);
854 else
855 {
856 uint32_t uDst = *puDst;
857 uint32_t uResult = uDst - uSrc - 1;
858 *puDst = uResult;
859 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(fEFlags, uResult, uDst, uSrc, 32, uDst <= uSrc, uSrc ^ RT_BIT_32(31));
860 }
861 return fEFlags;
862}
863
864
865IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_sbb_u16,(uint32_t fEFlags, uint16_t *puDst, uint16_t uSrc))
866{
867 if (!(fEFlags & X86_EFL_CF))
868 fEFlags = iemAImpl_sub_u16(fEFlags, puDst, uSrc);
869 else
870 {
871 uint16_t uDst = *puDst;
872 uint16_t uResult = uDst - uSrc - 1;
873 *puDst = uResult;
874 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(fEFlags, uResult, uDst, uSrc, 16, uDst <= uSrc, uSrc ^ (uint16_t)0x8000);
875 }
876 return fEFlags;
877}
878
879
880IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_sbb_u8,(uint32_t fEFlags, uint8_t *puDst, uint8_t uSrc))
881{
882 if (!(fEFlags & X86_EFL_CF))
883 fEFlags = iemAImpl_sub_u8(fEFlags, puDst, uSrc);
884 else
885 {
886 uint8_t uDst = *puDst;
887 uint8_t uResult = uDst - uSrc - 1;
888 *puDst = uResult;
889 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(fEFlags, uResult, uDst, uSrc, 8, uDst <= uSrc, uSrc ^ (uint8_t)0x80);
890 }
891 return fEFlags;
892}
893
894# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
895
896
897/*
898 * OR
899 */
900
901IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_or_u64,(uint32_t fEFlags, uint64_t *puDst, uint64_t uSrc))
902{
903 uint64_t uResult = *puDst | uSrc;
904 *puDst = uResult;
905 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(fEFlags, uResult, 64, 0);
906 return fEFlags;
907}
908
909# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
910
911IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_or_u32,(uint32_t fEFlags, uint32_t *puDst, uint32_t uSrc))
912{
913 uint32_t uResult = *puDst | uSrc;
914 *puDst = uResult;
915 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(fEFlags, uResult, 32, 0);
916 return fEFlags;
917}
918
919
920IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_or_u16,(uint32_t fEFlags, uint16_t *puDst, uint16_t uSrc))
921{
922 uint16_t uResult = *puDst | uSrc;
923 *puDst = uResult;
924 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(fEFlags, uResult, 16, 0);
925 return fEFlags;
926}
927
928
929IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_or_u8,(uint32_t fEFlags, uint8_t *puDst, uint8_t uSrc))
930{
931 uint8_t uResult = *puDst | uSrc;
932 *puDst = uResult;
933 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(fEFlags, uResult, 8, 0);
934 return fEFlags;
935}
936
937# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
938
939/*
940 * XOR
941 */
942
943IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_xor_u64,(uint32_t fEFlags, uint64_t *puDst, uint64_t uSrc))
944{
945 uint64_t uResult = *puDst ^ uSrc;
946 *puDst = uResult;
947 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(fEFlags, uResult, 64, 0);
948 return fEFlags;
949}
950
951# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
952
953IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_xor_u32,(uint32_t fEFlags, uint32_t *puDst, uint32_t uSrc))
954{
955 uint32_t uResult = *puDst ^ uSrc;
956 *puDst = uResult;
957 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(fEFlags, uResult, 32, 0);
958 return fEFlags;
959}
960
961
962IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_xor_u16,(uint32_t fEFlags, uint16_t *puDst, uint16_t uSrc))
963{
964 uint16_t uResult = *puDst ^ uSrc;
965 *puDst = uResult;
966 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(fEFlags, uResult, 16, 0);
967 return fEFlags;
968}
969
970
971IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_xor_u8,(uint32_t fEFlags, uint8_t *puDst, uint8_t uSrc))
972{
973 uint8_t uResult = *puDst ^ uSrc;
974 *puDst = uResult;
975 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(fEFlags, uResult, 8, 0);
976 return fEFlags;
977}
978
979# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
980
981/*
982 * AND
983 */
984
985IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_and_u64,(uint32_t fEFlags, uint64_t *puDst, uint64_t uSrc))
986{
987 uint64_t const uResult = *puDst & uSrc;
988 *puDst = uResult;
989 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(fEFlags, uResult, 64, 0);
990 return fEFlags;
991}
992
993# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
994
995IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_and_u32,(uint32_t fEFlags, uint32_t *puDst, uint32_t uSrc))
996{
997 uint32_t const uResult = *puDst & uSrc;
998 *puDst = uResult;
999 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(fEFlags, uResult, 32, 0);
1000 return fEFlags;
1001}
1002
1003
1004IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_and_u16,(uint32_t fEFlags, uint16_t *puDst, uint16_t uSrc))
1005{
1006 uint16_t const uResult = *puDst & uSrc;
1007 *puDst = uResult;
1008 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(fEFlags, uResult, 16, 0);
1009 return fEFlags;
1010}
1011
1012
1013IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_and_u8,(uint32_t fEFlags, uint8_t *puDst, uint8_t uSrc))
1014{
1015 uint8_t const uResult = *puDst & uSrc;
1016 *puDst = uResult;
1017 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(fEFlags, uResult, 8, 0);
1018 return fEFlags;
1019}
1020
1021# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1022#endif /* !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY) */
1023
1024/*
1025 * ANDN (BMI1 instruction)
1026 */
1027
1028IEM_DECL_IMPL_DEF(void, iemAImpl_andn_u64_fallback,(uint64_t *puDst, uint64_t uSrc1, uint64_t uSrc2, uint32_t *pfEFlags))
1029{
1030 uint64_t const uResult = ~uSrc1 & uSrc2;
1031 *puDst = uResult;
1032 uint32_t fEFlags = *pfEFlags;
1033 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(fEFlags, uResult, 64, 0);
1034 *pfEFlags = fEFlags;
1035}
1036
1037
1038IEM_DECL_IMPL_DEF(void, iemAImpl_andn_u32_fallback,(uint32_t *puDst, uint32_t uSrc1, uint32_t uSrc2, uint32_t *pfEFlags))
1039{
1040 uint32_t const uResult = ~uSrc1 & uSrc2;
1041 *puDst = uResult;
1042 uint32_t fEFlags = *pfEFlags;
1043 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(fEFlags, uResult, 32, 0);
1044 *pfEFlags = fEFlags;
1045}
1046
1047
1048#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1049IEM_DECL_IMPL_DEF(void, iemAImpl_andn_u64,(uint64_t *puDst, uint64_t uSrc1, uint64_t uSrc2, uint32_t *pfEFlags))
1050{
1051 iemAImpl_andn_u64_fallback(puDst, uSrc1, uSrc2, pfEFlags);
1052}
1053#endif
1054
1055
1056#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1057IEM_DECL_IMPL_DEF(void, iemAImpl_andn_u32,(uint32_t *puDst, uint32_t uSrc1, uint32_t uSrc2, uint32_t *pfEFlags))
1058{
1059 iemAImpl_andn_u32_fallback(puDst, uSrc1, uSrc2, pfEFlags);
1060}
1061#endif
1062
1063#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
1064
1065/*
1066 * CMP
1067 */
1068
1069IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cmp_u64,(uint32_t fEFlags, uint64_t const *puDst, uint64_t uSrc))
1070{
1071 uint64_t uDstTmp = *puDst;
1072 return iemAImpl_sub_u64(fEFlags, &uDstTmp, uSrc);
1073}
1074
1075# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1076
1077IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cmp_u32,(uint32_t fEFlags, uint32_t const *puDst, uint32_t uSrc))
1078{
1079 uint32_t uDstTmp = *puDst;
1080 return iemAImpl_sub_u32(fEFlags, &uDstTmp, uSrc);
1081}
1082
1083
1084IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cmp_u16,(uint32_t fEFlags, uint16_t const *puDst, uint16_t uSrc))
1085{
1086 uint16_t uDstTmp = *puDst;
1087 return iemAImpl_sub_u16(fEFlags, &uDstTmp, uSrc);
1088}
1089
1090
1091IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cmp_u8,(uint32_t fEFlags, uint8_t const *puDst, uint8_t uSrc))
1092{
1093 uint8_t uDstTmp = *puDst;
1094 return iemAImpl_sub_u8(fEFlags, &uDstTmp, uSrc);
1095}
1096
1097# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1098
1099/*
1100 * TEST
1101 */
1102
1103IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_test_u64,(uint32_t fEFlags, uint64_t const *puDst, uint64_t uSrc))
1104{
1105 uint64_t uResult = *puDst & uSrc;
1106 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(fEFlags, uResult, 64, 0);
1107 return fEFlags;
1108}
1109
1110# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1111
1112IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_test_u32,(uint32_t fEFlags, uint32_t const *puDst, uint32_t uSrc))
1113{
1114 uint32_t uResult = *puDst & uSrc;
1115 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(fEFlags, uResult, 32, 0);
1116 return fEFlags;
1117}
1118
1119
1120IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_test_u16,(uint32_t fEFlags, uint16_t const *puDst, uint16_t uSrc))
1121{
1122 uint16_t uResult = *puDst & uSrc;
1123 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(fEFlags, uResult, 16, 0);
1124 return fEFlags;
1125}
1126
1127
1128IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_test_u8,(uint32_t fEFlags, uint8_t const *puDst, uint8_t uSrc))
1129{
1130 uint8_t uResult = *puDst & uSrc;
1131 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGICAL(fEFlags, uResult, 8, 0);
1132 return fEFlags;
1133}
1134
1135# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1136
1137
1138/*
1139 * LOCK prefixed variants of the above
1140 */
1141
/**
 * Body of a LOCK-prefixed binary operation; works for any operand width
 * selected via @a a_cBitsWidth, not just 64-bit.
 *
 * Relies on the expanding function providing puDst, uSrc and fEFlagsIn.  The
 * destination is sampled once, the plain iemAImpl_<mnemonic>_u<width> worker
 * is run on a private copy, and the outcome is published with a
 * compare-exchange.  When the compare-exchange reports failure the worker is
 * run again on uSeen (whose address is handed to the compare-exchange,
 * presumably so it receives the current memory value).  The macro returns,
 * from the expanding function, the EFLAGS of the attempt that succeeded.
 */
# define DO_LOCKED_BIN_OP(a_Mnemonic, a_cBitsWidth) \
    do { \
        uint ## a_cBitsWidth ## _t uSeen = ASMAtomicUoReadU ## a_cBitsWidth(puDst); \
        for (;;) \
        { \
            uint ## a_cBitsWidth ## _t uNew = uSeen; \
            uint32_t const fEflNew = iemAImpl_ ## a_Mnemonic ## _u ## a_cBitsWidth(fEFlagsIn, &uNew, uSrc); \
            if (ASMAtomicCmpXchgExU ## a_cBitsWidth(puDst, uNew, uSeen, &uSeen)) \
                return fEflNew; \
        } \
    } while (0)
1155
1156
/**
 * Emits iemAImpl_<mnemonic>_u<width>_locked, a function taking EFLAGS by
 * value (fEFlagsIn), a destination pointer and a source value, whose whole
 * body is DO_LOCKED_BIN_OP.
 */
#define EMIT_LOCKED_BIN_OP(a_Mnemonic, a_cBitsWidth) \
    IEM_DECL_IMPL_DEF(uint32_t, \
                      iemAImpl_ ## a_Mnemonic ## _u ## a_cBitsWidth ## _locked, \
                      (uint32_t fEFlagsIn, uint ## a_cBitsWidth ## _t *puDst, uint ## a_cBitsWidth ## _t uSrc)) \
    { \
        DO_LOCKED_BIN_OP(a_Mnemonic, a_cBitsWidth); \
    }
1164
1165EMIT_LOCKED_BIN_OP(add, 64)
1166EMIT_LOCKED_BIN_OP(adc, 64)
1167EMIT_LOCKED_BIN_OP(sub, 64)
1168EMIT_LOCKED_BIN_OP(sbb, 64)
1169EMIT_LOCKED_BIN_OP(or, 64)
1170EMIT_LOCKED_BIN_OP(xor, 64)
1171EMIT_LOCKED_BIN_OP(and, 64)
1172# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1173EMIT_LOCKED_BIN_OP(add, 32)
1174EMIT_LOCKED_BIN_OP(adc, 32)
1175EMIT_LOCKED_BIN_OP(sub, 32)
1176EMIT_LOCKED_BIN_OP(sbb, 32)
1177EMIT_LOCKED_BIN_OP(or, 32)
1178EMIT_LOCKED_BIN_OP(xor, 32)
1179EMIT_LOCKED_BIN_OP(and, 32)
1180
1181EMIT_LOCKED_BIN_OP(add, 16)
1182EMIT_LOCKED_BIN_OP(adc, 16)
1183EMIT_LOCKED_BIN_OP(sub, 16)
1184EMIT_LOCKED_BIN_OP(sbb, 16)
1185EMIT_LOCKED_BIN_OP(or, 16)
1186EMIT_LOCKED_BIN_OP(xor, 16)
1187EMIT_LOCKED_BIN_OP(and, 16)
1188
1189EMIT_LOCKED_BIN_OP(add, 8)
1190EMIT_LOCKED_BIN_OP(adc, 8)
1191EMIT_LOCKED_BIN_OP(sub, 8)
1192EMIT_LOCKED_BIN_OP(sbb, 8)
1193EMIT_LOCKED_BIN_OP(or, 8)
1194EMIT_LOCKED_BIN_OP(xor, 8)
1195EMIT_LOCKED_BIN_OP(and, 8)
1196# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1197
1198
1199/*
 * Bit operations (these still take EFLAGS by pointer and return void, unlike the binary operations above).
1201 */
1202
1203/*
1204 * BT
1205 */
1206
1207IEM_DECL_IMPL_DEF(void, iemAImpl_bt_u64,(uint64_t const *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1208{
1209 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1210 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1211 Assert(uSrc < 64);
1212 uint64_t uDst = *puDst;
1213 if (uDst & RT_BIT_64(uSrc))
1214 *pfEFlags |= X86_EFL_CF;
1215 else
1216 *pfEFlags &= ~X86_EFL_CF;
1217}
1218
1219# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1220
1221IEM_DECL_IMPL_DEF(void, iemAImpl_bt_u32,(uint32_t const *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1222{
1223 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1224 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1225 Assert(uSrc < 32);
1226 uint32_t uDst = *puDst;
1227 if (uDst & RT_BIT_32(uSrc))
1228 *pfEFlags |= X86_EFL_CF;
1229 else
1230 *pfEFlags &= ~X86_EFL_CF;
1231}
1232
1233IEM_DECL_IMPL_DEF(void, iemAImpl_bt_u16,(uint16_t const *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1234{
1235 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1236 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1237 Assert(uSrc < 16);
1238 uint16_t uDst = *puDst;
1239 if (uDst & RT_BIT_32(uSrc))
1240 *pfEFlags |= X86_EFL_CF;
1241 else
1242 *pfEFlags &= ~X86_EFL_CF;
1243}
1244
1245# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1246
1247/*
1248 * BTC
1249 */
1250
1251IEM_DECL_IMPL_DEF(void, iemAImpl_btc_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1252{
1253 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1254 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1255 Assert(uSrc < 64);
1256 uint64_t fMask = RT_BIT_64(uSrc);
1257 uint64_t uDst = *puDst;
1258 if (uDst & fMask)
1259 {
1260 uDst &= ~fMask;
1261 *puDst = uDst;
1262 *pfEFlags |= X86_EFL_CF;
1263 }
1264 else
1265 {
1266 uDst |= fMask;
1267 *puDst = uDst;
1268 *pfEFlags &= ~X86_EFL_CF;
1269 }
1270}
1271
1272# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1273
1274IEM_DECL_IMPL_DEF(void, iemAImpl_btc_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1275{
1276 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1277 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1278 Assert(uSrc < 32);
1279 uint32_t fMask = RT_BIT_32(uSrc);
1280 uint32_t uDst = *puDst;
1281 if (uDst & fMask)
1282 {
1283 uDst &= ~fMask;
1284 *puDst = uDst;
1285 *pfEFlags |= X86_EFL_CF;
1286 }
1287 else
1288 {
1289 uDst |= fMask;
1290 *puDst = uDst;
1291 *pfEFlags &= ~X86_EFL_CF;
1292 }
1293}
1294
1295
1296IEM_DECL_IMPL_DEF(void, iemAImpl_btc_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1297{
1298 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1299 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1300 Assert(uSrc < 16);
1301 uint16_t fMask = RT_BIT_32(uSrc);
1302 uint16_t uDst = *puDst;
1303 if (uDst & fMask)
1304 {
1305 uDst &= ~fMask;
1306 *puDst = uDst;
1307 *pfEFlags |= X86_EFL_CF;
1308 }
1309 else
1310 {
1311 uDst |= fMask;
1312 *puDst = uDst;
1313 *pfEFlags &= ~X86_EFL_CF;
1314 }
1315}
1316
1317# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1318
1319/*
1320 * BTR
1321 */
1322
1323IEM_DECL_IMPL_DEF(void, iemAImpl_btr_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1324{
1325 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after an
1326 logical operation (AND/OR/whatever). */
1327 Assert(uSrc < 64);
1328 uint64_t fMask = RT_BIT_64(uSrc);
1329 uint64_t uDst = *puDst;
1330 if (uDst & fMask)
1331 {
1332 uDst &= ~fMask;
1333 *puDst = uDst;
1334 *pfEFlags |= X86_EFL_CF;
1335 }
1336 else
1337 *pfEFlags &= ~X86_EFL_CF;
1338}
1339
1340# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1341
1342IEM_DECL_IMPL_DEF(void, iemAImpl_btr_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1343{
1344 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after an
1345 logical operation (AND/OR/whatever). */
1346 Assert(uSrc < 32);
1347 uint32_t fMask = RT_BIT_32(uSrc);
1348 uint32_t uDst = *puDst;
1349 if (uDst & fMask)
1350 {
1351 uDst &= ~fMask;
1352 *puDst = uDst;
1353 *pfEFlags |= X86_EFL_CF;
1354 }
1355 else
1356 *pfEFlags &= ~X86_EFL_CF;
1357}
1358
1359
1360IEM_DECL_IMPL_DEF(void, iemAImpl_btr_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1361{
1362 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after an
1363 logical operation (AND/OR/whatever). */
1364 Assert(uSrc < 16);
1365 uint16_t fMask = RT_BIT_32(uSrc);
1366 uint16_t uDst = *puDst;
1367 if (uDst & fMask)
1368 {
1369 uDst &= ~fMask;
1370 *puDst = uDst;
1371 *pfEFlags |= X86_EFL_CF;
1372 }
1373 else
1374 *pfEFlags &= ~X86_EFL_CF;
1375}
1376
1377# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1378
1379/*
1380 * BTS
1381 */
1382
1383IEM_DECL_IMPL_DEF(void, iemAImpl_bts_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1384{
1385 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after an
1386 logical operation (AND/OR/whatever). */
1387 Assert(uSrc < 64);
1388 uint64_t fMask = RT_BIT_64(uSrc);
1389 uint64_t uDst = *puDst;
1390 if (uDst & fMask)
1391 *pfEFlags |= X86_EFL_CF;
1392 else
1393 {
1394 uDst |= fMask;
1395 *puDst = uDst;
1396 *pfEFlags &= ~X86_EFL_CF;
1397 }
1398}
1399
1400# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1401
1402IEM_DECL_IMPL_DEF(void, iemAImpl_bts_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1403{
1404 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after an
1405 logical operation (AND/OR/whatever). */
1406 Assert(uSrc < 32);
1407 uint32_t fMask = RT_BIT_32(uSrc);
1408 uint32_t uDst = *puDst;
1409 if (uDst & fMask)
1410 *pfEFlags |= X86_EFL_CF;
1411 else
1412 {
1413 uDst |= fMask;
1414 *puDst = uDst;
1415 *pfEFlags &= ~X86_EFL_CF;
1416 }
1417}
1418
1419
1420IEM_DECL_IMPL_DEF(void, iemAImpl_bts_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1421{
1422 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after an
1423 logical operation (AND/OR/whatever). */
1424 Assert(uSrc < 16);
1425 uint16_t fMask = RT_BIT_32(uSrc);
1426 uint32_t uDst = *puDst;
1427 if (uDst & fMask)
1428 *pfEFlags |= X86_EFL_CF;
1429 else
1430 {
1431 uDst |= fMask;
1432 *puDst = uDst;
1433 *pfEFlags &= ~X86_EFL_CF;
1434 }
1435}
1436
1437# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1438
1439
/**
 * Body of a LOCK-prefixed bit operation that still takes EFLAGS by pointer;
 * works for any operand width selected via @a a_cBitsWidth, not just 64-bit.
 *
 * Relies on the expanding function providing puDst, uSrc and pfEFlags.  The
 * destination is sampled once; each attempt runs the plain
 * iemAImpl_<mnemonic>_u<width> worker on private copies of the value and of
 * *pfEFlags and then tries to publish the value with a compare-exchange.
 * When that reports failure another attempt is made starting from uSeen
 * (whose address is handed to the compare-exchange, presumably so it receives
 * the current memory value).  *pfEFlags is only written once an attempt has
 * succeeded.
 */
# define DO_LOCKED_BIN_TODO_OP(a_Mnemonic, a_cBitsWidth) \
    do { \
        uint ## a_cBitsWidth ## _t uSeen = ASMAtomicUoReadU ## a_cBitsWidth(puDst); \
        uint32_t fEflNew; \
        for (;;) \
        { \
            uint ## a_cBitsWidth ## _t uNew = uSeen; \
            fEflNew = *pfEFlags; \
            iemAImpl_ ## a_Mnemonic ## _u ## a_cBitsWidth(&uNew, uSrc, &fEflNew); \
            if (ASMAtomicCmpXchgExU ## a_cBitsWidth(puDst, uNew, uSeen, &uSeen)) \
                break; \
        } \
        *pfEFlags = fEflNew; \
    } while (0)
1454
1455
/**
 * Emits iemAImpl_<mnemonic>_u<width>_locked, a void function taking a
 * destination pointer, a source value and an EFLAGS pointer, whose whole body
 * is DO_LOCKED_BIN_TODO_OP.
 */
#define EMIT_LOCKED_BIN_TODO_OP(a_Mnemonic, a_cBitsWidth) \
    IEM_DECL_IMPL_DEF(void, \
                      iemAImpl_ ## a_Mnemonic ## _u ## a_cBitsWidth ## _locked, \
                      (uint ## a_cBitsWidth ## _t *puDst, uint ## a_cBitsWidth ## _t uSrc, uint32_t *pfEFlags)) \
    { \
        DO_LOCKED_BIN_TODO_OP(a_Mnemonic, a_cBitsWidth); \
    }
1463
1464
1465EMIT_LOCKED_BIN_TODO_OP(btc, 64)
1466EMIT_LOCKED_BIN_TODO_OP(btr, 64)
1467EMIT_LOCKED_BIN_TODO_OP(bts, 64)
1468# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1469EMIT_LOCKED_BIN_TODO_OP(btc, 32)
1470EMIT_LOCKED_BIN_TODO_OP(btr, 32)
1471EMIT_LOCKED_BIN_TODO_OP(bts, 32)
1472
1473EMIT_LOCKED_BIN_TODO_OP(btc, 16)
1474EMIT_LOCKED_BIN_TODO_OP(btr, 16)
1475EMIT_LOCKED_BIN_TODO_OP(bts, 16)
1476# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1477
1478#endif /* !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY) */
1479
1480/*
1481 * Helpers for BSR and BSF.
1482 *
1483 * Note! "undefined" flags: OF, SF, AF, PF, CF.
 *       Intel behavior modelled on 10980xe, AMD on 3990X.  Other microarchitectures
 *       may produce different results (see https://www.sandpile.org/x86/flags.htm),
 *       but we restrict ourselves to emulating these recent ones.
1487 */
/**
 * Stores a BSF/BSR result and updates EFLAGS the Intel way.
 *
 * All of OF, SF, ZF, AF, PF and CF are cleared first.  When @a a_iBit is
 * non-zero, (a_iBit - 1) is stored thru @a a_puDst and PF is calculated from
 * that value; when it is zero the destination is not written and ZF and PF are
 * set.
 *
 * The macro only uses its own parameters now; it used to reference a variable
 * literally named pfEFlags in the expanding scope, ignoring the second
 * argument.
 *
 * @param   a_puDst     Pointer to the destination operand.
 * @param   a_pfEFlags  Pointer to the EFLAGS value to update.
 * @param   a_iBit      One-based bit index, 0 meaning no bit found.  Evaluated once.
 */
#define SET_BIT_SEARCH_RESULT_INTEL(a_puDst, a_pfEFlags, a_iBit) do { \
        unsigned iBit = (a_iBit); \
        uint32_t fEfl = *(a_pfEFlags) & ~(X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF); \
        if (iBit) \
        { \
            *(a_puDst) = --iBit; \
            fEfl |= IEM_EFL_CALC_PARITY(iBit); \
        } \
        else \
            fEfl |= X86_EFL_ZF | X86_EFL_PF; \
        *(a_pfEFlags) = fEfl; \
    } while (0)
/**
 * Stores a BSF/BSR result and updates EFLAGS the AMD way.
 *
 * Only ZF is modified: it is cleared and (a_iBit - 1) is stored thru
 * @a a_puDst when @a a_iBit is non-zero, otherwise ZF is set and the
 * destination is not written.
 *
 * The macro only uses its own parameters now; it used to reference a variable
 * literally named pfEFlags in the expanding scope, ignoring the second
 * argument.
 *
 * @param   a_puDst     Pointer to the destination operand.
 * @param   a_pfEFlags  Pointer to the EFLAGS value to update.
 * @param   a_iBit      One-based bit index, 0 meaning no bit found.  Evaluated once.
 */
#define SET_BIT_SEARCH_RESULT_AMD(a_puDst, a_pfEFlags, a_iBit) do { \
        unsigned const iBit = (a_iBit); \
        if (iBit) \
        { \
            *(a_puDst)     = iBit - 1; \
            *(a_pfEFlags) &= ~X86_EFL_ZF; \
        } \
        else \
            *(a_pfEFlags) |= X86_EFL_ZF; \
    } while (0)
1510
1511/*
1512 * BSF - first (least significant) bit set
1513 */
1514#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
1515IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1516{
1517 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitFirstSetU64(uSrc));
1518}
1519#endif
1520
1521IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u64_intel,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1522{
1523 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitFirstSetU64(uSrc));
1524}
1525
1526IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u64_amd,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1527{
1528 SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, ASMBitFirstSetU64(uSrc));
1529}
1530
1531#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
1532IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1533{
1534 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitFirstSetU32(uSrc));
1535}
1536#endif
1537
1538IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u32_intel,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1539{
1540 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitFirstSetU32(uSrc));
1541}
1542
1543IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u32_amd,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1544{
1545 SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, ASMBitFirstSetU32(uSrc));
1546}
1547
1548
1549#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
1550IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1551{
1552 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitFirstSetU16(uSrc));
1553}
1554#endif
1555
1556IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u16_intel,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1557{
1558 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitFirstSetU16(uSrc));
1559}
1560
1561IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u16_amd,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1562{
1563 SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, ASMBitFirstSetU16(uSrc));
1564}
1565
1566
1567
1568/*
1569 * BSR - last (most significant) bit set
1570 */
1571#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
1572IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1573{
1574 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitLastSetU64(uSrc));
1575}
1576#endif
1577
1578IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u64_intel,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1579{
1580 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitLastSetU64(uSrc));
1581}
1582
1583IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u64_amd,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1584{
1585 SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, ASMBitLastSetU64(uSrc));
1586}
1587
1588
1589#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
1590IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1591{
1592 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitLastSetU32(uSrc));
1593}
1594#endif
1595
1596IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u32_intel,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1597{
1598 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitLastSetU32(uSrc));
1599}
1600
1601IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u32_amd,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1602{
1603 SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, ASMBitLastSetU32(uSrc));
1604}
1605
1606
1607#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
1608IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1609{
1610 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitLastSetU16(uSrc));
1611}
1612#endif
1613
1614IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u16_intel,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1615{
1616 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitLastSetU16(uSrc));
1617}
1618
1619IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u16_amd,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1620{
1621 SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, ASMBitLastSetU16(uSrc));
1622}
1623
1624
1625/*
1626 * Helpers for LZCNT and TZCNT.
1627 */
/**
 * Stores an LZCNT/TZCNT result and updates EFLAGS the Intel way.
 *
 * The count is always stored.  OF, SF, ZF, AF, PF and CF are cleared, then PF
 * is calculated from a non-zero count, or ZF and PF are set for a zero count;
 * finally CF is set when the source operand is zero.
 *
 * @param   a_puDst     Pointer to the destination operand.
 * @param   a_uSrc      The source operand; may be any expression (it is
 *                      parenthesized before being negated).
 * @param   a_pfEFlags  Pointer to the EFLAGS value to update.
 * @param   a_uResult   The bit count.  Evaluated once.
 */
#define SET_BIT_CNT_SEARCH_RESULT_INTEL(a_puDst, a_uSrc, a_pfEFlags, a_uResult) do { \
        unsigned const uResult = (a_uResult); \
        *(a_puDst) = uResult; \
        uint32_t fEfl = *(a_pfEFlags) & ~(X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF); \
        if (uResult) \
            fEfl |= IEM_EFL_CALC_PARITY(uResult); \
        else \
            fEfl |= X86_EFL_ZF | X86_EFL_PF; \
        if (!(a_uSrc)) \
            fEfl |= X86_EFL_CF; \
        *(a_pfEFlags) = fEfl; \
    } while (0)
/**
 * Stores an LZCNT/TZCNT result and updates EFLAGS the AMD way.
 *
 * The count is always stored.  Only ZF and CF are modified: ZF is set when
 * the count is zero, CF is set when the source operand is zero, and both are
 * cleared otherwise.
 *
 * @param   a_puDst     Pointer to the destination operand.
 * @param   a_uSrc      The source operand; may be any expression (it is
 *                      parenthesized before being negated).
 * @param   a_pfEFlags  Pointer to the EFLAGS value to update.
 * @param   a_uResult   The bit count.  Evaluated once.
 */
#define SET_BIT_CNT_SEARCH_RESULT_AMD(a_puDst, a_uSrc, a_pfEFlags, a_uResult) do { \
        unsigned const uResult = (a_uResult); \
        *(a_puDst) = uResult; \
        uint32_t fEfl = *(a_pfEFlags) & ~(X86_EFL_ZF | X86_EFL_CF); \
        if (!uResult) \
            fEfl |= X86_EFL_ZF; \
        if (!(a_uSrc)) \
            fEfl |= X86_EFL_CF; \
        *(a_pfEFlags) = fEfl; \
    } while (0)
1650
1651
1652/*
1653 * LZCNT - count leading zero bits.
1654 */
1655#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
1656IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1657{
1658 iemAImpl_lzcnt_u64_intel(puDst, uSrc, pfEFlags);
1659}
1660#endif
1661
1662IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u64_intel,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1663{
1664 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, pfEFlags, ASMCountLeadingZerosU64(uSrc));
1665}
1666
1667IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u64_amd,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1668{
1669 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, pfEFlags, ASMCountLeadingZerosU64(uSrc));
1670}
1671
1672
1673#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
1674IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1675{
1676 iemAImpl_lzcnt_u32_intel(puDst, uSrc, pfEFlags);
1677}
1678#endif
1679
1680IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u32_intel,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1681{
1682 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, pfEFlags, ASMCountLeadingZerosU32(uSrc));
1683}
1684
1685IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u32_amd,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1686{
1687 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, pfEFlags, ASMCountLeadingZerosU32(uSrc));
1688}
1689
1690
1691#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
1692IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1693{
1694 iemAImpl_lzcnt_u16_intel(puDst, uSrc, pfEFlags);
1695}
1696#endif
1697
1698IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u16_intel,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1699{
1700 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, pfEFlags, ASMCountLeadingZerosU16(uSrc));
1701}
1702
1703IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u16_amd,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1704{
1705 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, pfEFlags, ASMCountLeadingZerosU16(uSrc));
1706}
1707
1708
1709/*
 * TZCNT - count trailing zero bits.
1711 */
1712#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
1713IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1714{
1715 iemAImpl_tzcnt_u64_intel(puDst, uSrc, pfEFlags);
1716}
1717#endif
1718
1719IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u64_intel,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1720{
1721 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, pfEFlags, ASMCountTrailingZerosU64(uSrc));
1722}
1723
1724IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u64_amd,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1725{
1726 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, pfEFlags, ASMCountTrailingZerosU64(uSrc));
1727}
1728
1729
1730#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
1731IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1732{
1733 iemAImpl_tzcnt_u32_intel(puDst, uSrc, pfEFlags);
1734}
1735#endif
1736
1737IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u32_intel,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1738{
1739 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, pfEFlags, ASMCountTrailingZerosU32(uSrc));
1740}
1741
1742IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u32_amd,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1743{
1744 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, pfEFlags, ASMCountTrailingZerosU32(uSrc));
1745}
1746
1747
1748#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
1749IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1750{
1751 iemAImpl_tzcnt_u16_intel(puDst, uSrc, pfEFlags);
1752}
1753#endif
1754
1755IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u16_intel,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1756{
1757 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, pfEFlags, ASMCountTrailingZerosU16(uSrc));
1758}
1759
1760IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u16_amd,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1761{
1762 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, pfEFlags, ASMCountTrailingZerosU16(uSrc));
1763}
1764
1765
1766
1767/*
1768 * BEXTR (BMI1 instruction)
1769 */
/**
 * Emits iemAImpl_bextr_u<a_cBits><a_Suffix>: bit field extract.
 *
 * uSrc2 bits 7:0 give the first bit to extract and bits 15:8 the number of
 * bits.  A start position at or beyond the operand width yields 0; a length
 * at or beyond the operand width applies no mask to the shifted value.  All
 * of OF, SF, ZF, AF, PF and CF are cleared and ZF is then set for a zero
 * result.
 *
 * @param   a_cBits     Operand width in bits.
 * @param   a_Type      Unsigned integer type of that width.
 * @param   a_Suffix    Function name suffix (may be RT_NOTHING).
 */
#define EMIT_BEXTR(a_cBits, a_Type, a_Suffix) \
IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_bextr_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc1, \
                                                                       a_Type uSrc2, uint32_t *pfEFlags)) \
{ \
    uint32_t      fEflOut = *pfEFlags & ~(X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF); \
    uint8_t const iStart  = (uint8_t)uSrc2; \
    uint8_t const cLen    = (uint8_t)(uSrc2 >> 8); \
    \
    /* uSrc1 is considered virtually zero extended to 512 bits width, so a \
       start position outside the operand leaves the field at zero. */ \
    a_Type uField = 0; \
    if (iStart < a_cBits) \
    { \
        uField = uSrc1 >> iStart; \
        if (cLen < a_cBits) \
            uField &= RT_CONCAT(RT_BIT_,a_cBits)(cLen) - 1; \
    } \
    *puDst = uField; \
    if (!uField) \
        fEflOut |= X86_EFL_ZF; \
    /** @todo complete flag calculations. */ \
    *pfEFlags = fEflOut; \
}
1796
1797EMIT_BEXTR(64, uint64_t, _fallback)
1798EMIT_BEXTR(32, uint32_t, _fallback)
1799#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1800EMIT_BEXTR(64, uint64_t, RT_NOTHING)
1801#endif
1802#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1803EMIT_BEXTR(32, uint32_t, RT_NOTHING)
1804#endif
1805
1806/*
1807 * BLSR (BMI1 instruction)
1808 */
1809#define EMIT_BLSR(a_cBits, a_Type, a_Suffix) \
1810IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_blsr_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc, uint32_t *pfEFlags)) \
1811{ \
1812 uint32_t fEfl1 = *pfEFlags; \
1813 uint32_t fEfl2 = fEfl1; \
1814 *puDst = uSrc; \
1815 fEfl1 = iemAImpl_sub_u ## a_cBits(fEfl1, &uSrc, 1); \
1816 fEfl2 = iemAImpl_and_u ## a_cBits(fEfl2, puDst, uSrc); \
1817 \
1818 /* AMD: The carry flag is from the SUB operation. */ \
1819 /* 10890xe: PF always cleared? */ \
1820 fEfl2 &= ~(X86_EFL_CF | X86_EFL_PF); \
1821 fEfl2 |= fEfl1 & X86_EFL_CF; \
1822 *pfEFlags = fEfl2; \
1823}
1824
1825EMIT_BLSR(64, uint64_t, _fallback)
1826EMIT_BLSR(32, uint32_t, _fallback)
1827#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1828EMIT_BLSR(64, uint64_t, RT_NOTHING)
1829#endif
1830#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1831EMIT_BLSR(32, uint32_t, RT_NOTHING)
1832#endif
1833
1834/*
1835 * BLSMSK (BMI1 instruction)
1836 */
1837#define EMIT_BLSMSK(a_cBits, a_Type, a_Suffix) \
1838IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_blsmsk_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc, uint32_t *pfEFlags)) \
1839{ \
1840 uint32_t fEfl1 = *pfEFlags; \
1841 uint32_t fEfl2 = fEfl1; \
1842 *puDst = uSrc; \
1843 fEfl1 = iemAImpl_sub_u ## a_cBits(fEfl1, &uSrc, 1); \
1844 fEfl2 = iemAImpl_xor_u ## a_cBits(fEfl2, puDst, uSrc); \
1845 \
1846 /* AMD: The carry flag is from the SUB operation. */ \
1847 /* 10890xe: PF always cleared? */ \
1848 fEfl2 &= ~(X86_EFL_CF | X86_EFL_PF); \
1849 fEfl2 |= fEfl1 & X86_EFL_CF; \
1850 *pfEFlags = fEfl2; \
1851}
1852
1853EMIT_BLSMSK(64, uint64_t, _fallback)
1854EMIT_BLSMSK(32, uint32_t, _fallback)
1855#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1856EMIT_BLSMSK(64, uint64_t, RT_NOTHING)
1857#endif
1858#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1859EMIT_BLSMSK(32, uint32_t, RT_NOTHING)
1860#endif
1861
1862/*
1863 * BLSI (BMI1 instruction)
1864 */
1865#define EMIT_BLSI(a_cBits, a_Type, a_Suffix) \
1866IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_blsi_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc, uint32_t *pfEFlags)) \
1867{ \
1868 uint32_t fEfl1 = *pfEFlags; \
1869 uint32_t fEfl2 = fEfl1; \
1870 *puDst = uSrc; \
1871 iemAImpl_neg_u ## a_cBits(&uSrc, &fEfl1); \
1872 fEfl2 = iemAImpl_and_u ## a_cBits(fEfl2, puDst, uSrc); \
1873 \
1874 /* AMD: The carry flag is from the SUB operation. */ \
1875 /* 10890xe: PF always cleared? */ \
1876 fEfl2 &= ~(X86_EFL_CF | X86_EFL_PF); \
1877 fEfl2 |= fEfl1 & X86_EFL_CF; \
1878 *pfEFlags = fEfl2; \
1879}
1880
1881EMIT_BLSI(64, uint64_t, _fallback)
1882EMIT_BLSI(32, uint32_t, _fallback)
1883#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1884EMIT_BLSI(64, uint64_t, RT_NOTHING)
1885#endif
1886#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1887EMIT_BLSI(32, uint32_t, RT_NOTHING)
1888#endif
1889
1890/*
1891 * BZHI (BMI2 instruction)
1892 */
1893#define EMIT_BZHI(a_cBits, a_Type, a_Suffix) \
1894IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_bzhi_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc1, \
1895 a_Type uSrc2, uint32_t *pfEFlags)) \
1896{ \
1897 uint32_t fEfl = *pfEFlags & ~(X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF); \
1898 a_Type uResult; \
1899 uint8_t const iFirstBit = (uint8_t)uSrc2; \
1900 if (iFirstBit < a_cBits) \
1901 uResult = uSrc1 & (((a_Type)1 << iFirstBit) - 1); \
1902 else \
1903 { \
1904 uResult = uSrc1; \
1905 fEfl |= X86_EFL_CF; \
1906 } \
1907 *puDst = uResult; \
1908 fEfl |= X86_EFL_CALC_ZF(uResult); \
1909 fEfl |= X86_EFL_CALC_SF(uResult, a_cBits); \
1910 *pfEFlags = fEfl; \
1911}
1912
1913EMIT_BZHI(64, uint64_t, _fallback)
1914EMIT_BZHI(32, uint32_t, _fallback)
1915#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1916EMIT_BZHI(64, uint64_t, RT_NOTHING)
1917#endif
1918#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1919EMIT_BZHI(32, uint32_t, RT_NOTHING)
1920#endif
1921
1922/*
1923 * POPCNT
1924 */
1925RT_ALIGNAS_VAR(64) static uint8_t const g_abBitCounts6[64] =
1926{
1927 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
1928 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
1929 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
1930 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
1931};
1932
1933/** @todo Use native popcount where possible and employ some more efficient
1934 * algorithm here (or in asm.h fallback)! */
1935
1936DECLINLINE(uint8_t) iemPopCountU16(uint16_t u16)
1937{
1938 return g_abBitCounts6[ u16 & 0x3f]
1939 + g_abBitCounts6[(u16 >> 6) & 0x3f]
1940 + g_abBitCounts6[(u16 >> 12) & 0x3f];
1941}
1942
1943DECLINLINE(uint8_t) iemPopCountU32(uint32_t u32)
1944{
1945 return g_abBitCounts6[ u32 & 0x3f]
1946 + g_abBitCounts6[(u32 >> 6) & 0x3f]
1947 + g_abBitCounts6[(u32 >> 12) & 0x3f]
1948 + g_abBitCounts6[(u32 >> 18) & 0x3f]
1949 + g_abBitCounts6[(u32 >> 24) & 0x3f]
1950 + g_abBitCounts6[(u32 >> 30) & 0x3f];
1951}
1952
1953DECLINLINE(uint8_t) iemPopCountU64(uint64_t u64)
1954{
1955 return g_abBitCounts6[ u64 & 0x3f]
1956 + g_abBitCounts6[(u64 >> 6) & 0x3f]
1957 + g_abBitCounts6[(u64 >> 12) & 0x3f]
1958 + g_abBitCounts6[(u64 >> 18) & 0x3f]
1959 + g_abBitCounts6[(u64 >> 24) & 0x3f]
1960 + g_abBitCounts6[(u64 >> 30) & 0x3f]
1961 + g_abBitCounts6[(u64 >> 36) & 0x3f]
1962 + g_abBitCounts6[(u64 >> 42) & 0x3f]
1963 + g_abBitCounts6[(u64 >> 48) & 0x3f]
1964 + g_abBitCounts6[(u64 >> 54) & 0x3f]
1965 + g_abBitCounts6[(u64 >> 60) & 0x3f];
1966}
1967
1968#define EMIT_POPCNT(a_cBits, a_Type, a_Suffix) \
1969IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_popcnt_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc, uint32_t *pfEFlags)) \
1970{ \
1971 uint32_t fEfl = *pfEFlags & ~(X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF); \
1972 a_Type uResult; \
1973 if (uSrc) \
1974 uResult = iemPopCountU ## a_cBits(uSrc); \
1975 else \
1976 { \
1977 fEfl |= X86_EFL_ZF; \
1978 uResult = 0; \
1979 } \
1980 *puDst = uResult; \
1981 *pfEFlags = fEfl; \
1982}
1983
1984EMIT_POPCNT(64, uint64_t, _fallback)
1985EMIT_POPCNT(32, uint32_t, _fallback)
1986EMIT_POPCNT(16, uint16_t, _fallback)
1987#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1988EMIT_POPCNT(64, uint64_t, RT_NOTHING)
1989#endif
1990#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1991EMIT_POPCNT(32, uint32_t, RT_NOTHING)
1992EMIT_POPCNT(16, uint16_t, RT_NOTHING)
1993#endif
1994
1995
1996#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
1997
1998/*
1999 * XCHG
2000 */
2001
2002IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u64_locked,(uint64_t *puMem, uint64_t *puReg))
2003{
2004#if ARCH_BITS >= 64
2005 *puReg = ASMAtomicXchgU64(puMem, *puReg);
2006#else
2007 uint64_t uOldMem = *puMem;
2008 while (!ASMAtomicCmpXchgExU64(puMem, *puReg, uOldMem, &uOldMem))
2009 ASMNopPause();
2010 *puReg = uOldMem;
2011#endif
2012}
2013
2014# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2015
2016IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u32_locked,(uint32_t *puMem, uint32_t *puReg))
2017{
2018 *puReg = ASMAtomicXchgU32(puMem, *puReg);
2019}
2020
2021
2022IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u16_locked,(uint16_t *puMem, uint16_t *puReg))
2023{
2024 *puReg = ASMAtomicXchgU16(puMem, *puReg);
2025}
2026
2027
2028IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u8_locked,(uint8_t *puMem, uint8_t *puReg))
2029{
2030 *puReg = ASMAtomicXchgU8(puMem, *puReg);
2031}
2032
2033# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2034
2035
2036/* Unlocked variants for fDisregardLock mode: */
2037
2038IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u64_unlocked,(uint64_t *puMem, uint64_t *puReg))
2039{
2040 uint64_t const uOld = *puMem;
2041 *puMem = *puReg;
2042 *puReg = uOld;
2043}
2044
2045# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2046
2047IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u32_unlocked,(uint32_t *puMem, uint32_t *puReg))
2048{
2049 uint32_t const uOld = *puMem;
2050 *puMem = *puReg;
2051 *puReg = uOld;
2052}
2053
2054
2055IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u16_unlocked,(uint16_t *puMem, uint16_t *puReg))
2056{
2057 uint16_t const uOld = *puMem;
2058 *puMem = *puReg;
2059 *puReg = uOld;
2060}
2061
2062
2063IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u8_unlocked,(uint8_t *puMem, uint8_t *puReg))
2064{
2065 uint8_t const uOld = *puMem;
2066 *puMem = *puReg;
2067 *puReg = uOld;
2068}
2069
2070# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2071
2072
2073/*
2074 * XADD and LOCK XADD.
2075 */
2076#define EMIT_XADD(a_cBitsWidth, a_Type) \
2077IEM_DECL_IMPL_DEF(void, iemAImpl_xadd_u ## a_cBitsWidth,(a_Type *puDst, a_Type *puReg, uint32_t *pfEFlags)) \
2078{ \
2079 a_Type uDst = *puDst; \
2080 a_Type uResult = uDst; \
2081 *pfEFlags = iemAImpl_add_u ## a_cBitsWidth(*pfEFlags, &uResult, *puReg); \
2082 *puDst = uResult; \
2083 *puReg = uDst; \
2084} \
2085\
2086IEM_DECL_IMPL_DEF(void, iemAImpl_xadd_u ## a_cBitsWidth ## _locked,(a_Type *puDst, a_Type *puReg, uint32_t *pfEFlags)) \
2087{ \
2088 a_Type uOld = ASMAtomicUoReadU ## a_cBitsWidth(puDst); \
2089 a_Type uResult; \
2090 uint32_t fEflTmp; \
2091 do \
2092 { \
2093 uResult = uOld; \
2094 fEflTmp = iemAImpl_add_u ## a_cBitsWidth(*pfEFlags, &uResult, *puReg); \
2095 } while (!ASMAtomicCmpXchgExU ## a_cBitsWidth(puDst, uResult, uOld, &uOld)); \
2096 *puReg = uOld; \
2097 *pfEFlags = fEflTmp; \
2098}
2099EMIT_XADD(64, uint64_t)
2100# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2101EMIT_XADD(32, uint32_t)
2102EMIT_XADD(16, uint16_t)
2103EMIT_XADD(8, uint8_t)
2104# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2105
2106#endif
2107
2108/*
2109 * CMPXCHG, CMPXCHG8B, CMPXCHG16B
2110 *
2111 * Note! We don't have non-locking/atomic cmpxchg primitives, so all cmpxchg
2112 * instructions are emulated as locked.
2113 */
2114#if defined(IEM_WITHOUT_ASSEMBLY)
2115
2116IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u8_locked, (uint8_t *pu8Dst, uint8_t *puAl, uint8_t uSrcReg, uint32_t *pEFlags))
2117{
2118 uint8_t uOld = *puAl;
2119 if (ASMAtomicCmpXchgExU8(pu8Dst, uSrcReg, uOld, puAl))
2120 Assert(*puAl == uOld);
2121 *pEFlags = iemAImpl_cmp_u8(*pEFlags, &uOld, *puAl);
2122}
2123
2124
2125IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u16_locked,(uint16_t *pu16Dst, uint16_t *puAx, uint16_t uSrcReg, uint32_t *pEFlags))
2126{
2127 uint16_t uOld = *puAx;
2128 if (ASMAtomicCmpXchgExU16(pu16Dst, uSrcReg, uOld, puAx))
2129 Assert(*puAx == uOld);
2130 *pEFlags = iemAImpl_cmp_u16(*pEFlags, &uOld, *puAx);
2131}
2132
2133
2134IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u32_locked,(uint32_t *pu32Dst, uint32_t *puEax, uint32_t uSrcReg, uint32_t *pEFlags))
2135{
2136 uint32_t uOld = *puEax;
2137 if (ASMAtomicCmpXchgExU32(pu32Dst, uSrcReg, uOld, puEax))
2138 Assert(*puEax == uOld);
2139 *pEFlags = iemAImpl_cmp_u32(*pEFlags, &uOld, *puEax);
2140}
2141
2142
2143# if ARCH_BITS == 32
2144IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u64_locked,(uint64_t *pu64Dst, uint64_t *puRax, uint64_t *puSrcReg, uint32_t *pEFlags))
2145# else
2146IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u64_locked,(uint64_t *pu64Dst, uint64_t *puRax, uint64_t uSrcReg, uint32_t *pEFlags))
2147# endif
2148{
2149# if ARCH_BITS == 32
2150 uint64_t const uSrcReg = *puSrcReg;
2151# endif
2152 uint64_t uOld = *puRax;
2153 if (ASMAtomicCmpXchgExU64(pu64Dst, uSrcReg, uOld, puRax))
2154 Assert(*puRax == uOld);
2155 *pEFlags = iemAImpl_cmp_u64(*pEFlags, &uOld, *puRax);
2156}
2157
2158
2159IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg8b_locked,(uint64_t *pu64Dst, PRTUINT64U pu64EaxEdx, PRTUINT64U pu64EbxEcx,
2160 uint32_t *pEFlags))
2161{
2162 uint64_t const uNew = pu64EbxEcx->u;
2163 uint64_t const uOld = pu64EaxEdx->u;
2164 if (ASMAtomicCmpXchgExU64(pu64Dst, uNew, uOld, &pu64EaxEdx->u))
2165 {
2166 Assert(pu64EaxEdx->u == uOld);
2167 *pEFlags |= X86_EFL_ZF;
2168 }
2169 else
2170 *pEFlags &= ~X86_EFL_ZF;
2171}
2172
2173
# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_ARM64)
/**
 * CMPXCHG16B, locked variant (AMD64 and ARM64 builds only).
 *
 * Performs the compare-exchange through ASMAtomicCmpXchgU128v2 (AMD64) or
 * ASMAtomicCmpXchgU128 (otherwise), passing @a pu128RaxRdx as both the
 * expected value and the place for the old value.  ZF is set when the helper
 * reports success and cleared otherwise; no other flag is touched.
 *
 * @param   pu128Dst        The memory operand.
 * @param   pu128RaxRdx     Expected value in, old value out.
 * @param   pu128RbxRcx     The replacement value.
 * @param   pEFlags         EFLAGS, only ZF is updated.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg16b_locked,(PRTUINT128U pu128Dst, PRTUINT128U pu128RaxRdx, PRTUINT128U pu128RbxRcx,
                                                    uint32_t *pEFlags))
{
#  ifdef VBOX_STRICT
    RTUINT128U const uExpected = *pu128RaxRdx; /* only needed by the assertion */
#  endif
    bool fExchanged;
#  if defined(RT_ARCH_AMD64)
    fExchanged = ASMAtomicCmpXchgU128v2(&pu128Dst->u, pu128RbxRcx->s.Hi, pu128RbxRcx->s.Lo, pu128RaxRdx->s.Hi, pu128RaxRdx->s.Lo,
                                        &pu128RaxRdx->u);
#  else
    fExchanged = ASMAtomicCmpXchgU128(&pu128Dst->u, pu128RbxRcx->u, pu128RaxRdx->u, &pu128RaxRdx->u);
#  endif
    if (!fExchanged)
        *pEFlags &= ~X86_EFL_ZF;
    else
    {
        Assert(pu128RaxRdx->s.Lo == uExpected.s.Lo && pu128RaxRdx->s.Hi == uExpected.s.Hi);
        *pEFlags |= X86_EFL_ZF;
    }
}
# endif /* RT_ARCH_AMD64 || RT_ARCH_ARM64 */
2195
2196#endif /* defined(IEM_WITHOUT_ASSEMBLY) */
2197
2198# if !defined(RT_ARCH_ARM64) /** @todo may need this for unaligned accesses... */
2199IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg16b_fallback,(PRTUINT128U pu128Dst, PRTUINT128U pu128RaxRdx,
2200 PRTUINT128U pu128RbxRcx, uint32_t *pEFlags))
2201{
2202 RTUINT128U u128Tmp = *pu128Dst;
2203 if ( u128Tmp.s.Lo == pu128RaxRdx->s.Lo
2204 && u128Tmp.s.Hi == pu128RaxRdx->s.Hi)
2205 {
2206 *pu128Dst = *pu128RbxRcx;
2207 *pEFlags |= X86_EFL_ZF;
2208 }
2209 else
2210 {
2211 *pu128RaxRdx = u128Tmp;
2212 *pEFlags &= ~X86_EFL_ZF;
2213 }
2214}
2215#endif /* !RT_ARCH_ARM64 */
2216
2217#if defined(IEM_WITHOUT_ASSEMBLY)
2218
2219/* Unlocked versions mapped to the locked ones: */
2220
2221IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u8, (uint8_t *pu8Dst, uint8_t *puAl, uint8_t uSrcReg, uint32_t *pEFlags))
2222{
2223 iemAImpl_cmpxchg_u8_locked(pu8Dst, puAl, uSrcReg, pEFlags);
2224}
2225
2226
2227IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u16, (uint16_t *pu16Dst, uint16_t *puAx, uint16_t uSrcReg, uint32_t *pEFlags))
2228{
2229# if 0
2230 /* If correctly aligned, used the locked variation. */
2231 if (!((uintptr_t)pu16Dst & 1))
2232 iemAImpl_cmpxchg_u16_locked(pu16Dst, puAx, uSrcReg, pEFlags);
2233 else
2234# endif
2235 {
2236 /* Otherwise emulate it as best as we can. */
2237 uint16_t const uOld = *puAx;
2238 uint16_t const uDst = *pu16Dst;
2239 if (uOld == uDst)
2240 {
2241 *pu16Dst = uSrcReg;
2242 *pEFlags = iemAImpl_cmp_u16(*pEFlags, &uOld, uOld);
2243 }
2244 else
2245 {
2246 *puAx = uDst;
2247 *pEFlags = iemAImpl_cmp_u16(*pEFlags, &uOld, uDst);
2248 }
2249 }
2250}
2251
2252
2253IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u32, (uint32_t *pu32Dst, uint32_t *puEax, uint32_t uSrcReg, uint32_t *pEFlags))
2254{
2255# if 0
2256 /* If correctly aligned, used the locked variation. */
2257 if (!((uintptr_t)pu32Dst & 3))
2258 iemAImpl_cmpxchg_u32_locked(pu32Dst, puEax, uSrcReg, pEFlags);
2259 else
2260# endif
2261 {
2262 /* Otherwise emulate it as best as we can. */
2263 uint32_t const uOld = *puEax;
2264 uint32_t const uDst = *pu32Dst;
2265 if (uOld == uDst)
2266 {
2267 *pu32Dst = uSrcReg;
2268 *pEFlags = iemAImpl_cmp_u32(*pEFlags, &uOld, uOld);
2269 }
2270 else
2271 {
2272 *puEax = uDst;
2273 *pEFlags = iemAImpl_cmp_u32(*pEFlags, &uOld, uDst);
2274 }
2275 }
2276}
2277
2278
2279# if ARCH_BITS == 32
2280IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u64, (uint64_t *pu64Dst, uint64_t *puRax, uint64_t *puSrcReg, uint32_t *pEFlags))
2281{
2282# if 0
2283 /* If correctly aligned, used the locked variation. */
2284 if (!((uintptr_t)pu32Dst & 7))
2285 iemAImpl_cmpxchg_u64_locked(pu64Dst, puRax, puSrcReg, pEFlags);
2286 else
2287# endif
2288 {
2289 /* Otherwise emulate it as best as we can. */
2290 uint64_t const uOld = *puRax;
2291 uint64_t const uSrc = *puSrcReg;
2292 uint64_t const uDst = *pu64Dst;
2293 if (uOld == uDst)
2294 {
2295 *pu64Dst = uSrc;
2296 *pEFlags = iemAImpl_cmp_u64(*pEFlags, &uOld, uOld);
2297 }
2298 else
2299 {
2300 *puRax = uDst;
2301 *pEFlags = iemAImpl_cmp_u64(*pEFlags, &uOld, uDst);
2302 }
2303 }
2304}
2305# else /* ARCH_BITS != 32 */
2306IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u64, (uint64_t *pu64Dst, uint64_t *puRax, uint64_t uSrcReg, uint32_t *pEFlags))
2307{
2308# if 0
2309 /* If correctly aligned, used the locked variation. */
2310 if (!((uintptr_t)pu64Dst & 7))
2311 iemAImpl_cmpxchg_u64_locked(pu64Dst, puRax, uSrcReg, pEFlags);
2312 else
2313# endif
2314 {
2315 /* Otherwise emulate it as best as we can. */
2316 uint64_t const uOld = *puRax;
2317 uint64_t const uDst = *pu64Dst;
2318 if (uOld == uDst)
2319 {
2320 *pu64Dst = uSrcReg;
2321 *pEFlags = iemAImpl_cmp_u64(*pEFlags, &uOld, uOld);
2322 }
2323 else
2324 {
2325 *puRax = uDst;
2326 *pEFlags = iemAImpl_cmp_u64(*pEFlags, &uOld, uDst);
2327 }
2328 }
2329}
2330# endif /* ARCH_BITS != 32 */
2331
2332
2333IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg8b,(uint64_t *pu64Dst, PRTUINT64U pu64EaxEdx, PRTUINT64U pu64EbxEcx, uint32_t *pEFlags))
2334{
2335# if 0
2336 /* If correctly aligned, used the locked variation. */
2337 if (!((uintptr_t)pu64Dst & 7))
2338 iemAImpl_cmpxchg8b_locked(pu64Dst, pu64EaxEdx, pu64EbxEcx, pEFlags);
2339 else
2340# endif
2341 {
2342 /* Otherwise emulate it as best as we can. */
2343 uint64_t const uNew = pu64EbxEcx->u;
2344 uint64_t const uOld = pu64EaxEdx->u;
2345 uint64_t const uDst = *pu64Dst;
2346 if (uDst == uOld)
2347 {
2348 *pu64Dst = uNew;
2349 *pEFlags |= X86_EFL_ZF;
2350 }
2351 else
2352 {
2353 pu64EaxEdx->u = uDst;
2354 *pEFlags &= ~X86_EFL_ZF;
2355 }
2356 }
2357}
2358
2359
2360IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg16b,(PRTUINT128U pu128Dst, PRTUINT128U pu128RaxRdx, PRTUINT128U pu128RbxRcx,
2361 uint32_t *pEFlags))
2362{
2363# if 0
2364 /* If correctly aligned, used the locked variation. */
2365 if (!((uintptr_t)pu64Dst & 15))
2366 iemAImpl_cmpxchg16b_locked(pu128Dst, pu128RaxRdx, pu128RbxRcx, pEFlags);
2367 else
2368# endif
2369 {
2370 /* Otherwise emulate it as best as we can. */
2371# ifdef RT_COMPILER_WITH_128BIT_INT_TYPES
2372 uint128_t const uNew = pu128RbxRcx->u;
2373 uint128_t const uOld = pu128RaxRdx->u;
2374 uint128_t const uDst = pu128Dst->u;
2375 if (uDst == uOld)
2376 {
2377 pu128Dst->u = uNew;
2378 *pEFlags |= X86_EFL_ZF;
2379 }
2380 else
2381 {
2382 pu128RaxRdx->u = uDst;
2383 *pEFlags &= ~X86_EFL_ZF;
2384 }
2385# else
2386 RTUINT128U const uNew = *pu128RbxRcx;
2387 RTUINT128U const uOld = *pu128RaxRdx;
2388 RTUINT128U const uDst = *pu128Dst;
2389 if ( uDst.s.Lo == uOld.s.Lo
2390 && uDst.s.Hi == uOld.s.Hi)
2391 {
2392 *pu128Dst = uNew;
2393 *pEFlags |= X86_EFL_ZF;
2394 }
2395 else
2396 {
2397 *pu128RaxRdx = uDst;
2398 *pEFlags &= ~X86_EFL_ZF;
2399 }
2400# endif
2401 }
2402}
2403
2404#endif /* defined(IEM_WITHOUT_ASSEMBLY) */
2405
2406#if (!defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)) \
2407 && !defined(DOXYGEN_RUNNING) /* Doxygen has some groking issues here and ends up mixing up input. Not worth tracking down now. */
2408
2409/*
2410 * MUL, IMUL, DIV and IDIV helpers.
2411 *
2412 * - The U64 versions must use 128-bit intermediates, so we need to abstract the
2413 * division step so we can select between using C operators and
2414 * RTUInt128DivRem/RTUInt128MulU64ByU64.
2415 *
2416 * - The U8 versions return their output in AL + AH instead of xDX + xAX, with
2417 * IDIV/DIV taking all of their input from AX too. This means we have to abstract
2418 * some input loads and the result storing.
2419 */
2420
2421DECLINLINE(void) RTUInt128DivRemByU64(PRTUINT128U pQuotient, PRTUINT128U pRemainder, PCRTUINT128U pDividend, uint64_t u64Divisor)
2422{
2423# ifdef __GNUC__ /* GCC maybe really annoying in function. */
2424 pQuotient->s.Lo = 0;
2425 pQuotient->s.Hi = 0;
2426# endif
2427 RTUINT128U Divisor;
2428 Divisor.s.Lo = u64Divisor;
2429 Divisor.s.Hi = 0;
2430 RTUInt128DivRem(pQuotient, pRemainder, pDividend, &Divisor);
2431}
2432
/*
 * Operand load/store and arithmetic abstractions for the EMIT_MUL, EMIT_IMUL,
 * EMIT_DIV and EMIT_IDIV templates.
 *
 * The load/store macros reference the parameter names of the emitted functions
 * (puA and puD, or puAX for the 8-bit variants).  Every macro expands to a
 * single parenthesized expression without a trailing semicolon, so it can be
 * used like a function call in any statement context.
 */

/** Loads the double width dividend from *puD:*puA. */
# define DIV_LOAD(a_Dividend) \
    ((a_Dividend).s.Lo = *puA, (a_Dividend).s.Hi = *puD)
/** Loads the 16-bit dividend from *puAX (8-bit variants). */
# define DIV_LOAD_U8(a_Dividend) \
    ((a_Dividend).u = *puAX)

/** Stores the quotient in *puA and the remainder in *puD. */
# define DIV_STORE(a_Quotient, a_uRemainder)     (*puA = (a_Quotient), *puD = (a_uRemainder))
/** Stores the quotient in the low byte and the remainder in the high byte of *puAX. */
# define DIV_STORE_U8(a_Quotient, a_uRemainder)  (*puAX = (uint8_t)(a_Quotient) | ((uint16_t)(a_uRemainder) << 8))

/** Fetches the implicit first factor from *puA. */
# define MUL_LOAD_F1()                           (*puA)
/** Fetches the implicit first factor from the low byte of *puAX. */
# define MUL_LOAD_F1_U8()                        ((uint8_t)*puAX)

/** Stores the low half of the product in *puA and the high half in *puD. */
# define MUL_STORE(a_Result)                     (*puA = (a_Result).s.Lo, *puD = (a_Result).s.Hi)
/** Stores the 16-bit product in *puAX. */
# define MUL_STORE_U8(a_Result)                  (*puAX = (a_Result).u)

/** Negates (two's complement) a double width value using C operators. */
# define MULDIV_NEG(a_Value, a_cBitsWidth2x) \
    ((a_Value).u = UINT ## a_cBitsWidth2x ## _C(0) - (a_Value).u)
/** Negates a 128-bit value using RTUInt128AssignNeg. */
# define MULDIV_NEG_U128(a_Value, a_cBitsWidth2x) \
    RTUInt128AssignNeg(&(a_Value))

/** Unsigned widening multiplication using C operators. */
# define MULDIV_MUL(a_Result, a_Factor1, a_Factor2, a_cBitsWidth2x) \
    ((a_Result).u = (uint ## a_cBitsWidth2x ## _t)(a_Factor1) * (a_Factor2))
/** Unsigned 64-bit by 64-bit multiplication using RTUInt128MulU64ByU64. */
# define MULDIV_MUL_U128(a_Result, a_Factor1, a_Factor2, a_cBitsWidth2x) \
    RTUInt128MulU64ByU64(&(a_Result), a_Factor1, a_Factor2)

/** Unsigned division with remainder using C operators. */
# define MULDIV_MODDIV(a_Quotient, a_Remainder, a_Dividend, a_uDivisor) \
    (  (a_Quotient).u  = (a_Dividend).u / (a_uDivisor), \
       (a_Remainder).u = (a_Dividend).u % (a_uDivisor))
/** Unsigned 128-bit by 64-bit division with remainder using RTUInt128DivRemByU64. */
# define MULDIV_MODDIV_U128(a_Quotient, a_Remainder, a_Dividend, a_uDivisor) \
    RTUInt128DivRemByU64(&(a_Quotient), &(a_Remainder), &(a_Dividend), a_uDivisor)
2462
2463
2464/*
2465 * MUL
2466 */
/**
 * Emits iemAImpl_mul_u<a_cBitsWidth><a_Suffix>: unsigned multiplication of the
 * implicit first factor by uFactor, producing a double width product.
 *
 * EFLAGS handling depends on @a a_fIntelFlags:
 *  - Non-zero (Intel 6700K and 10980XE behavior): SF, CF, OF, AF, ZF and PF
 *    are cleared; SF mirrors the top bit of the low half, PF comes from
 *    IEM_EFL_CALC_PARITY on the low half, and CF+OF are set when the high
 *    half is non-zero.
 *  - Zero (AMD 3990X behavior): only CF and OF change; both are set when the
 *    high half is non-zero and cleared otherwise.
 *
 * The emitted function always returns 0.
 *
 * @param   a_cBitsWidth    Operand width in bits.
 * @param   a_cBitsWidth2x  Twice the operand width (selects the RTUINTxxU type).
 * @param   a_Args          Parenthesized parameter list of the emitted function.
 * @param   a_CallArgs      Matching argument list (not used by this template).
 * @param   a_fnLoadF1      Macro fetching the implicit first factor.
 * @param   a_fnStore       Macro storing the product.
 * @param   a_fnMul         Macro doing the widening multiplication.
 * @param   a_Suffix        Function name suffix.
 * @param   a_fIntelFlags   Selects the EFLAGS behavior, see above.
 */
# define EMIT_MUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnMul, a_Suffix, a_fIntelFlags) \
IEM_DECL_IMPL_DEF(int, RT_CONCAT3(iemAImpl_mul_u,a_cBitsWidth,a_Suffix), a_Args) \
{ \
    RTUINT ## a_cBitsWidth2x ## U Product; \
    a_fnMul(Product, a_fnLoadF1(), uFactor, a_cBitsWidth2x); \
    a_fnStore(Product); \
    \
    /* Calc EFLAGS: */ \
    uint32_t fEflOut = *pfEFlags; \
    if (!(a_fIntelFlags)) \
    {   /* AMD: 3990X */ \
        fEflOut &= ~(X86_EFL_CF | X86_EFL_OF); \
        if (Product.s.Hi != 0) \
            fEflOut |= X86_EFL_CF | X86_EFL_OF; \
    } \
    else \
    {   /* Intel: 6700K and 10980XE behavior */ \
        fEflOut &= ~(X86_EFL_SF | X86_EFL_CF | X86_EFL_OF | X86_EFL_AF | X86_EFL_ZF | X86_EFL_PF); \
        fEflOut |= IEM_EFL_CALC_PARITY(Product.s.Lo); \
        if (Product.s.Lo & RT_BIT_64(a_cBitsWidth - 1)) \
            fEflOut |= X86_EFL_SF; \
        if (Product.s.Hi != 0) \
            fEflOut |= X86_EFL_CF | X86_EFL_OF; \
    } \
    *pfEFlags = fEflOut; \
    return 0; \
}
2495
2496# define EMIT_MUL(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnMul) \
2497 EMIT_MUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnMul, RT_NOTHING, 1) \
2498 EMIT_MUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnMul, _intel, 1) \
2499 EMIT_MUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnMul, _amd, 0) \
2500
2501# ifndef DOXYGEN_RUNNING /* this totally confuses doxygen for some reason */
2502EMIT_MUL(64, 128, (uint64_t *puA, uint64_t *puD, uint64_t uFactor, uint32_t *pfEFlags), (puA, puD, uFactor, pfEFlags),
2503 MUL_LOAD_F1, MUL_STORE, MULDIV_MUL_U128)
2504# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2505EMIT_MUL(32, 64, (uint32_t *puA, uint32_t *puD, uint32_t uFactor, uint32_t *pfEFlags), (puA, puD, uFactor, pfEFlags),
2506 MUL_LOAD_F1, MUL_STORE, MULDIV_MUL)
2507EMIT_MUL(16, 32, (uint16_t *puA, uint16_t *puD, uint16_t uFactor, uint32_t *pfEFlags), (puA, puD, uFactor, pfEFlags),
2508 MUL_LOAD_F1, MUL_STORE, MULDIV_MUL)
2509EMIT_MUL(8, 16, (uint16_t *puAX, uint8_t uFactor, uint32_t *pfEFlags), (puAX, uFactor, pfEFlags),
2510 MUL_LOAD_F1_U8, MUL_STORE_U8, MULDIV_MUL)
2511# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2512# endif /* !DOXYGEN_RUNNING */
2513
2514/*
2515 * MULX
2516 */
2517# define EMIT_MULX(a_cBitsWidth, a_cBitsWidth2x, a_uType, a_fnMul, a_Suffix) \
2518IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_mulx_u,a_cBitsWidth,a_Suffix), \
2519 (a_uType *puDst1, a_uType *puDst2, a_uType uSrc1, a_uType uSrc2)) \
2520{ \
2521 RTUINT ## a_cBitsWidth2x ## U Result; \
2522 a_fnMul(Result, uSrc1, uSrc2, a_cBitsWidth2x); \
2523 *puDst2 = Result.s.Lo; /* Lower part first, as we should return the high part when puDst2 == puDst1. */ \
2524 *puDst1 = Result.s.Hi; \
2525} \
2526
2527# ifndef DOXYGEN_RUNNING /* this totally confuses doxygen for some reason */
2528EMIT_MULX(64, 128, uint64_t, MULDIV_MUL_U128, RT_NOTHING)
2529EMIT_MULX(64, 128, uint64_t, MULDIV_MUL_U128, _fallback)
2530# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2531EMIT_MULX(32, 64, uint32_t, MULDIV_MUL, RT_NOTHING)
2532EMIT_MULX(32, 64, uint32_t, MULDIV_MUL, _fallback)
2533# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2534# endif /* !DOXYGEN_RUNNING */
2535
2536
2537/*
2538 * IMUL
2539 *
2540 * The SF, ZF, AF and PF flags are "undefined". AMD (3990x) leaves these
2541 * flags as is. Whereas Intel skylake (6700K and 10980X (Cascade Lake)) always
2542 * clear AF and ZF and calculates SF and PF as per the lower half of the result.
2543 */
/**
 * Emits iemAImpl_imul_u<a_cBitsWidth><a_Suffix>: signed multiplication of the
 * implicit first factor by uFactor2, producing a double width product.
 *
 * The work is done on magnitudes: negative factors (top bit set) are negated,
 * the magnitudes are multiplied unsigned, and the product is negated again
 * when exactly one factor was negative.
 *
 * CF and OF are set together when the product does not fit the operand width:
 * for a non-negative result when the high half is non-zero or the low half has
 * its top bit set, for a negative result when the magnitude exceeds
 * 2^(width-1).  When @a a_fIntelFlags is non-zero, AF and ZF are additionally
 * cleared and SF/PF are calculated from the low half of the product; otherwise
 * those four flags are left alone.
 *
 * The emitted function always returns 0.
 *
 * @param   a_cBitsWidth    Operand width in bits.
 * @param   a_cBitsWidth2x  Twice the operand width (selects the RTUINTxxU type).
 * @param   a_Args          Parenthesized parameter list of the emitted function.
 * @param   a_CallArgs      Matching argument list (not used by this template).
 * @param   a_fnLoadF1      Macro fetching the implicit first factor.
 * @param   a_fnStore       Macro storing the product.
 * @param   a_fnNeg         Macro negating a double width value.
 * @param   a_fnMul         Macro doing the widening multiplication.
 * @param   a_Suffix        Function name suffix.
 * @param   a_fIntelFlags   Selects the EFLAGS behavior, see above.
 */
# define EMIT_IMUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnNeg, a_fnMul, \
                         a_Suffix, a_fIntelFlags) \
IEM_DECL_IMPL_DEF(int, RT_CONCAT3(iemAImpl_imul_u,a_cBitsWidth,a_Suffix),a_Args) \
{ \
    uint32_t fEflOut = *pfEFlags & ~(X86_EFL_CF | X86_EFL_OF); \
    \
    /* Split both factors into sign and magnitude. */ \
    uint ## a_cBitsWidth ## _t const uFactor1   = a_fnLoadF1(); \
    bool const                       fNegative1 = (uFactor1 & RT_BIT_64(a_cBitsWidth - 1)) != 0; \
    bool const                       fNegative2 = (uFactor2 & RT_BIT_64(a_cBitsWidth - 1)) != 0; \
    uint ## a_cBitsWidth ## _t       uMagnitude1 = uFactor1; \
    if (fNegative1) \
        uMagnitude1 = UINT ## a_cBitsWidth ## _C(0) - uFactor1; \
    uint ## a_cBitsWidth ## _t       uMagnitude2 = uFactor2; \
    if (fNegative2) \
        uMagnitude2 = UINT ## a_cBitsWidth ## _C(0) - uFactor2; \
    \
    /* Unsigned multiply, then check the range and fix up the sign. */ \
    RTUINT ## a_cBitsWidth2x ## U Result; \
    a_fnMul(Result, uMagnitude1, uMagnitude2, a_cBitsWidth2x); \
    if (fNegative1 == fNegative2) \
    { \
        /* Non-negative result: must stay below 2^(width-1). */ \
        if (Result.s.Hi != 0 || Result.s.Lo >= RT_BIT_64(a_cBitsWidth - 1)) \
            fEflOut |= X86_EFL_CF | X86_EFL_OF; \
    } \
    else \
    { \
        /* Negative result: the magnitude may be up to and including 2^(width-1). */ \
        if (Result.s.Hi != 0 || Result.s.Lo > RT_BIT_64(a_cBitsWidth - 1)) \
            fEflOut |= X86_EFL_CF | X86_EFL_OF; \
        a_fnNeg(Result, a_cBitsWidth2x); \
    } \
    a_fnStore(Result); \
    \
    if (a_fIntelFlags) \
    { \
        fEflOut &= ~(X86_EFL_AF | X86_EFL_ZF | X86_EFL_SF | X86_EFL_PF); \
        fEflOut |= IEM_EFL_CALC_PARITY(Result.s.Lo & 0xff); \
        if (Result.s.Lo & RT_BIT_64(a_cBitsWidth - 1)) \
            fEflOut |= X86_EFL_SF; \
    } \
    *pfEFlags = fEflOut; \
    return 0; \
}
2600# define EMIT_IMUL(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnNeg, a_fnMul) \
2601 EMIT_IMUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnNeg, a_fnMul, RT_NOTHING, 1) \
2602 EMIT_IMUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnNeg, a_fnMul, _intel, 1) \
2603 EMIT_IMUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnNeg, a_fnMul, _amd, 0)
2604
2605# ifndef DOXYGEN_RUNNING /* this totally confuses doxygen for some reason */
2606EMIT_IMUL(64, 128, (uint64_t *puA, uint64_t *puD, uint64_t uFactor2, uint32_t *pfEFlags), (puA, puD, uFactor2, pfEFlags),
2607 MUL_LOAD_F1, MUL_STORE, MULDIV_NEG_U128, MULDIV_MUL_U128)
2608# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2609EMIT_IMUL(32, 64, (uint32_t *puA, uint32_t *puD, uint32_t uFactor2, uint32_t *pfEFlags), (puA, puD, uFactor2, pfEFlags),
2610 MUL_LOAD_F1, MUL_STORE, MULDIV_NEG, MULDIV_MUL)
2611EMIT_IMUL(16, 32, (uint16_t *puA, uint16_t *puD, uint16_t uFactor2, uint32_t *pfEFlags), (puA, puD, uFactor2, pfEFlags),
2612 MUL_LOAD_F1, MUL_STORE, MULDIV_NEG, MULDIV_MUL)
2613EMIT_IMUL(8, 16, (uint16_t *puAX, uint8_t uFactor2, uint32_t *pfEFlags), (puAX, uFactor2, pfEFlags),
2614 MUL_LOAD_F1_U8, MUL_STORE_U8, MULDIV_NEG, MULDIV_MUL)
2615# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2616# endif /* !DOXYGEN_RUNNING */
2617
2618
2619/*
2620 * IMUL with two operands are mapped onto the three operand variant, ignoring
2621 * the high part of the product.
2622 */
2623# define EMIT_IMUL_TWO(a_cBits, a_uType) \
2624IEM_DECL_IMPL_DEF(void, iemAImpl_imul_two_u ## a_cBits,(a_uType *puDst, a_uType uSrc, uint32_t *pfEFlags)) \
2625{ \
2626 a_uType uIgn; \
2627 iemAImpl_imul_u ## a_cBits(puDst, &uIgn, uSrc, pfEFlags); \
2628} \
2629\
2630IEM_DECL_IMPL_DEF(void, iemAImpl_imul_two_u ## a_cBits ## _intel,(a_uType *puDst, a_uType uSrc, uint32_t *pfEFlags)) \
2631{ \
2632 a_uType uIgn; \
2633 iemAImpl_imul_u ## a_cBits ## _intel(puDst, &uIgn, uSrc, pfEFlags); \
2634} \
2635\
2636IEM_DECL_IMPL_DEF(void, iemAImpl_imul_two_u ## a_cBits ## _amd,(a_uType *puDst, a_uType uSrc, uint32_t *pfEFlags)) \
2637{ \
2638 a_uType uIgn; \
2639 iemAImpl_imul_u ## a_cBits ## _amd(puDst, &uIgn, uSrc, pfEFlags); \
2640}
2641
2642EMIT_IMUL_TWO(64, uint64_t)
2643# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2644EMIT_IMUL_TWO(32, uint32_t)
2645EMIT_IMUL_TWO(16, uint16_t)
2646# endif
2647
2648
2649/*
2650 * DIV
2651 */
/**
 * Emits iemAImpl_div_u<a_cBitsWidth><a_Suffix>: unsigned division of a double
 * width dividend by uDivisor.
 *
 * The emitted function returns -1 (the caller's cue to raise \#DE) without
 * storing anything when the divisor is zero or when the high half of the
 * dividend is not below the divisor, i.e. when the quotient would not fit.
 * Otherwise quotient and remainder are stored via @a a_fnStore and 0 is
 * returned.
 *
 * EFLAGS: left alone when @a a_fIntelFlags is non-zero (Intel 6700K and
 * 10980XE); otherwise AF is set and PF, ZF and SF are cleared (AMD 3990X).
 *
 * @param   a_cBitsWidth    Operand width in bits.
 * @param   a_cBitsWidth2x  Twice the operand width (selects the RTUINTxxU type).
 * @param   a_Args          Parenthesized parameter list of the emitted function.
 * @param   a_CallArgs      Matching argument list (not used by this template).
 * @param   a_fnLoad        Macro loading the dividend.
 * @param   a_fnStore       Macro storing quotient and remainder.
 * @param   a_fnDivRem      Macro doing the division.
 * @param   a_Suffix        Function name suffix.
 * @param   a_fIntelFlags   Selects the EFLAGS behavior, see above.
 */
# define EMIT_DIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnDivRem, \
                        a_Suffix, a_fIntelFlags) \
IEM_DECL_IMPL_DEF(int, RT_CONCAT3(iemAImpl_div_u,a_cBitsWidth,a_Suffix),a_Args) \
{ \
    RTUINT ## a_cBitsWidth2x ## U Dividend; \
    a_fnLoad(Dividend); \
    \
    /* #DE: division by zero or quotient too large for the destination. */ \
    if (   uDivisor == 0 \
        || Dividend.s.Hi >= uDivisor) \
        return -1; \
    \
    RTUINT ## a_cBitsWidth2x ## U Quotient; \
    RTUINT ## a_cBitsWidth2x ## U Remainder; \
    a_fnDivRem(Quotient, Remainder, Dividend, uDivisor); \
    a_fnStore(Quotient.s.Lo, Remainder.s.Lo); \
    \
    /* Calc EFLAGS: Intel 6700K and 10980XE leaves them alone.  AMD 3990X sets AF and clears PF, ZF and SF. */ \
    if (!a_fIntelFlags) \
        *pfEFlags = (*pfEFlags & ~(X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF)) | X86_EFL_AF; \
    return 0; \
}
2673# define EMIT_DIV(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnDivRem) \
2674 EMIT_DIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnDivRem, RT_NOTHING, 1) \
2675 EMIT_DIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnDivRem, _intel, 1) \
2676 EMIT_DIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnDivRem, _amd, 0)
2677
2678# ifndef DOXYGEN_RUNNING /* this totally confuses doxygen for some reason */
2679EMIT_DIV(64,128,(uint64_t *puA, uint64_t *puD, uint64_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
2680 DIV_LOAD, DIV_STORE, MULDIV_MODDIV_U128)
2681# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2682EMIT_DIV(32,64, (uint32_t *puA, uint32_t *puD, uint32_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
2683 DIV_LOAD, DIV_STORE, MULDIV_MODDIV)
2684EMIT_DIV(16,32, (uint16_t *puA, uint16_t *puD, uint16_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
2685 DIV_LOAD, DIV_STORE, MULDIV_MODDIV)
2686EMIT_DIV(8,16, (uint16_t *puAX, uint8_t uDivisor, uint32_t *pfEFlags), (puAX, uDivisor, pfEFlags),
2687 DIV_LOAD_U8, DIV_STORE_U8, MULDIV_MODDIV)
2688# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2689# endif /* !DOXYGEN_RUNNING */
2690
2691
2692/*
2693 * IDIV
2694 *
2695 * EFLAGS are ignored and left as-is by Intel 6700K and 10980XE. AMD 3990X will
2696 * set AF and clear PF, ZF and SF just like it does for DIV.
2697 *
2698 */
/*
 * EMIT_IDIV_INNER - emits one signed divide worker, iemAImpl_idiv_u<width><suffix>.
 *
 * The worker performs the division on magnitudes: a dividend with its top bit
 * set is negated with a_fnNeg, a divisor with its top bit set is negated by
 * subtracting it from zero, and a_fnDivRem then does an unsigned divide.  The
 * four sign combinations are handled separately when storing, each with its
 * own range check on the unsigned quotient.
 *
 * It returns 0 after storing quotient and remainder through a_fnStore, and -1
 * (the divide error case) when uDivisor is zero or the range check of the
 * applicable sign combination fails.
 *
 * When a_fIntelFlags is zero, a successful division also clears PF, ZF and SF
 * and sets AF in *pfEFlags; otherwise the flags are not touched.  a_CallArgs
 * is accepted but not referenced by the expansion.
 */
# define EMIT_IDIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnNeg, a_fnDivRem, \
                         a_Suffix, a_fIntelFlags) \
IEM_DECL_IMPL_DEF(int, RT_CONCAT3(iemAImpl_idiv_u,a_cBitsWidth,a_Suffix),a_Args) \
{ \
    /* Note! Skylake leaves all flags alone. */ \
    \
    /** @todo overflow checks */ \
    /* NOTE(review): each sign combination below already range-checks the quotient; \
       confirm whether the todo above is stale. */ \
    if (uDivisor != 0) \
    { \
        /* \
         * Convert to unsigned division. \
         */ \
        RTUINT ## a_cBitsWidth2x ## U Dividend; \
        a_fnLoad(Dividend); \
        /* The sign of the dividend is the top bit of its high half. */ \
        bool const fSignedDividend = RT_BOOL(Dividend.s.Hi & RT_BIT_64(a_cBitsWidth - 1)); \
        if (fSignedDividend) \
            a_fnNeg(Dividend, a_cBitsWidth2x); \
        \
        /* Magnitude of the divisor (zero minus the divisor when its top bit is set). */ \
        uint ## a_cBitsWidth ## _t uDivisorPositive; \
        if (!(uDivisor & RT_BIT_64(a_cBitsWidth - 1))) \
            uDivisorPositive = uDivisor; \
        else \
            uDivisorPositive = UINT ## a_cBitsWidth ## _C(0) - uDivisor; \
        \
        RTUINT ## a_cBitsWidth2x ## U Remainder, Quotient; \
        a_fnDivRem(Quotient, Remainder, Dividend, uDivisorPositive); \
        \
        /* \
         * Setup the result, checking for overflows. \
         * \
         * A quotient that will be stored as-is may not exceed the signed maximum; \
         * one that will be stored negated may go up to 2^(width-1). \
         */ \
        if (!(uDivisor & RT_BIT_64(a_cBitsWidth - 1))) \
        { \
            if (!fSignedDividend) \
            { \
                /* Positive divisor, positive dividend => result positive. */ \
                if (Quotient.s.Hi == 0 && Quotient.s.Lo <= (uint ## a_cBitsWidth ## _t)INT ## a_cBitsWidth ## _MAX) \
                { \
                    a_fnStore(Quotient.s.Lo, Remainder.s.Lo); \
                    if (!a_fIntelFlags) \
                        *pfEFlags = (*pfEFlags & ~(X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF)) | X86_EFL_AF; \
                    return 0; \
                } \
            } \
            else \
            { \
                /* Positive divisor, negative dividend => result negative. */ \
                if (Quotient.s.Hi == 0 && Quotient.s.Lo <= RT_BIT_64(a_cBitsWidth - 1)) \
                { \
                    a_fnStore(UINT ## a_cBitsWidth ## _C(0) - Quotient.s.Lo, UINT ## a_cBitsWidth ## _C(0) - Remainder.s.Lo); \
                    if (!a_fIntelFlags) \
                        *pfEFlags = (*pfEFlags & ~(X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF)) | X86_EFL_AF; \
                    return 0; \
                } \
            } \
        } \
        else \
        { \
            if (!fSignedDividend) \
            { \
                /* Negative divisor, positive dividend => negative quotient, positive remainder. */ \
                if (Quotient.s.Hi == 0 && Quotient.s.Lo <= RT_BIT_64(a_cBitsWidth - 1)) \
                { \
                    a_fnStore(UINT ## a_cBitsWidth ## _C(0) - Quotient.s.Lo, Remainder.s.Lo); \
                    if (!a_fIntelFlags) \
                        *pfEFlags = (*pfEFlags & ~(X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF)) | X86_EFL_AF; \
                    return 0; \
                } \
            } \
            else \
            { \
                /* Negative divisor, negative dividend => positive quotient, negative remainder. */ \
                if (Quotient.s.Hi == 0 && Quotient.s.Lo <= (uint ## a_cBitsWidth ## _t)INT ## a_cBitsWidth ## _MAX) \
                { \
                    a_fnStore(Quotient.s.Lo, UINT ## a_cBitsWidth ## _C(0) - Remainder.s.Lo); \
                    if (!a_fIntelFlags) \
                        *pfEFlags = (*pfEFlags & ~(X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF)) | X86_EFL_AF; \
                    return 0; \
                } \
            } \
        } \
    } \
    /* #DE */ \
    return -1; \
}

/*
 * EMIT_IDIV - emits the three flavours of one signed divide worker: the
 * unsuffixed one and the _intel one (both with Intel flag behaviour) and the
 * _amd one.
 */
# define EMIT_IDIV(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnNeg, a_fnDivRem) \
    EMIT_IDIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnNeg, a_fnDivRem, RT_NOTHING, 1) \
    EMIT_IDIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnNeg, a_fnDivRem, _intel,     1) \
    EMIT_IDIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnNeg, a_fnDivRem, _amd,       0)

# ifndef DOXYGEN_RUNNING /* this totally confuses doxygen for some reason */
EMIT_IDIV(64,128,(uint64_t *puA, uint64_t *puD, uint64_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
          DIV_LOAD, DIV_STORE, MULDIV_NEG_U128, MULDIV_MODDIV_U128)
#  if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_IDIV(32,64,(uint32_t *puA, uint32_t *puD, uint32_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
          DIV_LOAD, DIV_STORE, MULDIV_NEG, MULDIV_MODDIV)
EMIT_IDIV(16,32,(uint16_t *puA, uint16_t *puD, uint16_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
          DIV_LOAD, DIV_STORE, MULDIV_NEG, MULDIV_MODDIV)
EMIT_IDIV(8,16,(uint16_t *puAX, uint8_t uDivisor, uint32_t *pfEFlags), (puAX, uDivisor, pfEFlags),
          DIV_LOAD_U8, DIV_STORE_U8, MULDIV_NEG, MULDIV_MODDIV)
#  endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
# endif /* !DOXYGEN_RUNNING */
2800
2801#endif /* (!defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)) && !defined(DOXYGEN_RUNNING) */
2802
2803
2804/*********************************************************************************************************************************
2805* Unary operations. *
2806*********************************************************************************************************************************/
2807#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
2808
/** @def IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC
 * Recalculates PF, AF, ZF, SF and OF in an EFLAGS variable after an INC or a
 * DEC instruction.
 *
 * CF is carried over untouched, as INC and DEC do not modify it.
 *
 * This is a statement macro: it yields no value, the new flags are written
 * back through @a a_pfEFlags.
 *
 * @param   a_pfEFlags      Pointer to the 32-bit EFLAGS value to update.
 * @param   a_uResult       Unsigned result value.
 * @param   a_uDst          The original destination value (for AF and OF calc).
 * @param   a_cBitsWidth    The width of the result (8, 16, 32, 64).
 * @param   a_OfMethod      0 for INC-style, 1 for DEC-style overflow detection.
 */
#define IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(a_pfEFlags, a_uResult, a_uDst, a_cBitsWidth, a_OfMethod) \
    do { \
        /* The top bit of this value is the new OF: INC overflows when the sign goes \
           from clear to set, DEC when it goes from set to clear. */ \
        uint64_t const uOfSrc = a_OfMethod == 0 \
                              ? ((a_uDst) ^ RT_BIT_64(a_cBitsWidth - 1)) & (a_uResult) \
                              : (a_uDst) & ((a_uResult) ^ RT_BIT_64(a_cBitsWidth - 1)); \
        /* Drop every status bit but CF, then add the recalculated ones. */ \
        uint32_t fEflNew = *(a_pfEFlags) & ~(X86_EFL_STATUS_BITS & ~X86_EFL_CF); \
        fEflNew |= X86_EFL_GET_OF_ ## a_cBitsWidth(uOfSrc); \
        fEflNew |= X86_EFL_CALC_SF(a_uResult, a_cBitsWidth); \
        fEflNew |= X86_EFL_CALC_ZF(a_uResult); \
        fEflNew |= ((uint32_t)(a_uResult) ^ (uint32_t)(a_uDst)) & X86_EFL_AF; \
        fEflNew |= IEM_EFL_CALC_PARITY(a_uResult); \
        *(a_pfEFlags) = fEflNew; \
    } while (0)
2834
2835/*
2836 * INC
2837 */
2838
2839IEM_DECL_IMPL_DEF(void, iemAImpl_inc_u64,(uint64_t *puDst, uint32_t *pfEFlags))
2840{
2841 uint64_t uDst = *puDst;
2842 uint64_t uResult = uDst + 1;
2843 *puDst = uResult;
2844 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 64, 0 /*INC*/);
2845}
2846
2847# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2848
2849IEM_DECL_IMPL_DEF(void, iemAImpl_inc_u32,(uint32_t *puDst, uint32_t *pfEFlags))
2850{
2851 uint32_t uDst = *puDst;
2852 uint32_t uResult = uDst + 1;
2853 *puDst = uResult;
2854 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 32, 0 /*INC*/);
2855}
2856
2857
2858IEM_DECL_IMPL_DEF(void, iemAImpl_inc_u16,(uint16_t *puDst, uint32_t *pfEFlags))
2859{
2860 uint16_t uDst = *puDst;
2861 uint16_t uResult = uDst + 1;
2862 *puDst = uResult;
2863 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 16, 0 /*INC*/);
2864}
2865
2866IEM_DECL_IMPL_DEF(void, iemAImpl_inc_u8,(uint8_t *puDst, uint32_t *pfEFlags))
2867{
2868 uint8_t uDst = *puDst;
2869 uint8_t uResult = uDst + 1;
2870 *puDst = uResult;
2871 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 8, 0 /*INC*/);
2872}
2873
2874# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2875
2876
2877/*
2878 * DEC
2879 */
2880
2881IEM_DECL_IMPL_DEF(void, iemAImpl_dec_u64,(uint64_t *puDst, uint32_t *pfEFlags))
2882{
2883 uint64_t uDst = *puDst;
2884 uint64_t uResult = uDst - 1;
2885 *puDst = uResult;
2886 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 64, 1 /*INC*/);
2887}
2888
2889# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2890
2891IEM_DECL_IMPL_DEF(void, iemAImpl_dec_u32,(uint32_t *puDst, uint32_t *pfEFlags))
2892{
2893 uint32_t uDst = *puDst;
2894 uint32_t uResult = uDst - 1;
2895 *puDst = uResult;
2896 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 32, 1 /*INC*/);
2897}
2898
2899
2900IEM_DECL_IMPL_DEF(void, iemAImpl_dec_u16,(uint16_t *puDst, uint32_t *pfEFlags))
2901{
2902 uint16_t uDst = *puDst;
2903 uint16_t uResult = uDst - 1;
2904 *puDst = uResult;
2905 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 16, 1 /*INC*/);
2906}
2907
2908
2909IEM_DECL_IMPL_DEF(void, iemAImpl_dec_u8,(uint8_t *puDst, uint32_t *pfEFlags))
2910{
2911 uint8_t uDst = *puDst;
2912 uint8_t uResult = uDst - 1;
2913 *puDst = uResult;
2914 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 8, 1 /*INC*/);
2915}
2916
2917# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2918
2919
2920/*
2921 * NOT
2922 */
2923
2924IEM_DECL_IMPL_DEF(void, iemAImpl_not_u64,(uint64_t *puDst, uint32_t *pfEFlags))
2925{
2926 uint64_t uDst = *puDst;
2927 uint64_t uResult = ~uDst;
2928 *puDst = uResult;
2929 /* EFLAGS are not modified. */
2930 RT_NOREF_PV(pfEFlags);
2931}
2932
2933# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2934
2935IEM_DECL_IMPL_DEF(void, iemAImpl_not_u32,(uint32_t *puDst, uint32_t *pfEFlags))
2936{
2937 uint32_t uDst = *puDst;
2938 uint32_t uResult = ~uDst;
2939 *puDst = uResult;
2940 /* EFLAGS are not modified. */
2941 RT_NOREF_PV(pfEFlags);
2942}
2943
2944IEM_DECL_IMPL_DEF(void, iemAImpl_not_u16,(uint16_t *puDst, uint32_t *pfEFlags))
2945{
2946 uint16_t uDst = *puDst;
2947 uint16_t uResult = ~uDst;
2948 *puDst = uResult;
2949 /* EFLAGS are not modified. */
2950 RT_NOREF_PV(pfEFlags);
2951}
2952
2953IEM_DECL_IMPL_DEF(void, iemAImpl_not_u8,(uint8_t *puDst, uint32_t *pfEFlags))
2954{
2955 uint8_t uDst = *puDst;
2956 uint8_t uResult = ~uDst;
2957 *puDst = uResult;
2958 /* EFLAGS are not modified. */
2959 RT_NOREF_PV(pfEFlags);
2960}
2961
2962# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2963
2964
2965/*
2966 * NEG
2967 */
2968
/**
 * Recalculates the status bits (CF, PF, AF, ZF, SF, and OF) in an EFLAGS
 * variable after a NEG instruction.
 *
 * CF is set whenever the original operand was non-zero.  OF is taken from the
 * top bit of (original & result), i.e. it is set when both have the sign bit
 * set.
 *
 * This is a statement macro: it yields no value, the new flags are written
 * back through @a a_pfEFlags.
 *
 * @param   a_pfEFlags      Pointer to the 32-bit EFLAGS value to update.
 * @param   a_uResult       Unsigned result value.
 * @param   a_uDst          The original destination value (for CF, AF and OF calc).
 * @param   a_cBitsWidth    The width of the result (8, 16, 32, 64).
 */
#define IEM_EFL_UPDATE_STATUS_BITS_FOR_NEG(a_pfEFlags, a_uResult, a_uDst, a_cBitsWidth) \
    do { \
        uint32_t fEflNew = *(a_pfEFlags) & ~(X86_EFL_STATUS_BITS | X86_EFL_CF); \
        if ((a_uDst) != 0) \
            fEflNew |= X86_EFL_CF; \
        fEflNew |= X86_EFL_GET_OF_ ## a_cBitsWidth((a_uDst) & (a_uResult)); \
        fEflNew |= X86_EFL_CALC_SF(a_uResult, a_cBitsWidth); \
        fEflNew |= X86_EFL_CALC_ZF(a_uResult); \
        fEflNew |= ((uint32_t)(a_uResult) ^ (uint32_t)(a_uDst)) & X86_EFL_AF; \
        fEflNew |= IEM_EFL_CALC_PARITY(a_uResult); \
        *(a_pfEFlags) = fEflNew; \
    } while (0)
2990
2991IEM_DECL_IMPL_DEF(void, iemAImpl_neg_u64,(uint64_t *puDst, uint32_t *pfEFlags))
2992{
2993 uint64_t uDst = *puDst;
2994 uint64_t uResult = (uint64_t)0 - uDst;
2995 *puDst = uResult;
2996 IEM_EFL_UPDATE_STATUS_BITS_FOR_NEG(pfEFlags, uResult, uDst, 64);
2997}
2998
2999# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
3000
3001IEM_DECL_IMPL_DEF(void, iemAImpl_neg_u32,(uint32_t *puDst, uint32_t *pfEFlags))
3002{
3003 uint32_t uDst = *puDst;
3004 uint32_t uResult = (uint32_t)0 - uDst;
3005 *puDst = uResult;
3006 IEM_EFL_UPDATE_STATUS_BITS_FOR_NEG(pfEFlags, uResult, uDst, 32);
3007}
3008
3009
3010IEM_DECL_IMPL_DEF(void, iemAImpl_neg_u16,(uint16_t *puDst, uint32_t *pfEFlags))
3011{
3012 uint16_t uDst = *puDst;
3013 uint16_t uResult = (uint16_t)0 - uDst;
3014 *puDst = uResult;
3015 IEM_EFL_UPDATE_STATUS_BITS_FOR_NEG(pfEFlags, uResult, uDst, 16);
3016}
3017
3018
3019IEM_DECL_IMPL_DEF(void, iemAImpl_neg_u8,(uint8_t *puDst, uint32_t *pfEFlags))
3020{
3021 uint8_t uDst = *puDst;
3022 uint8_t uResult = (uint8_t)0 - uDst;
3023 *puDst = uResult;
3024 IEM_EFL_UPDATE_STATUS_BITS_FOR_NEG(pfEFlags, uResult, uDst, 8);
3025}
3026
3027# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
3028
3029/*
3030 * Locked variants.
3031 */
3032
3033/** Emit a function for doing a locked unary operand operation. */
3034# define EMIT_LOCKED_UNARY_OP(a_Mnemonic, a_cBitsWidth) \
3035 IEM_DECL_IMPL_DEF(void, iemAImpl_ ## a_Mnemonic ## _u ## a_cBitsWidth ## _locked,(uint ## a_cBitsWidth ## _t *puDst, \
3036 uint32_t *pfEFlags)) \
3037 { \
3038 uint ## a_cBitsWidth ## _t uOld = ASMAtomicUoReadU ## a_cBitsWidth(puDst); \
3039 uint ## a_cBitsWidth ## _t uTmp; \
3040 uint32_t fEflTmp; \
3041 do \
3042 { \
3043 uTmp = uOld; \
3044 fEflTmp = *pfEFlags; \
3045 iemAImpl_ ## a_Mnemonic ## _u ## a_cBitsWidth(&uTmp, &fEflTmp); \
3046 } while (!ASMAtomicCmpXchgExU ## a_cBitsWidth(puDst, uTmp, uOld, &uOld)); \
3047 *pfEFlags = fEflTmp; \
3048 }
3049
3050EMIT_LOCKED_UNARY_OP(inc, 64)
3051EMIT_LOCKED_UNARY_OP(dec, 64)
3052EMIT_LOCKED_UNARY_OP(not, 64)
3053EMIT_LOCKED_UNARY_OP(neg, 64)
3054# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
3055EMIT_LOCKED_UNARY_OP(inc, 32)
3056EMIT_LOCKED_UNARY_OP(dec, 32)
3057EMIT_LOCKED_UNARY_OP(not, 32)
3058EMIT_LOCKED_UNARY_OP(neg, 32)
3059
3060EMIT_LOCKED_UNARY_OP(inc, 16)
3061EMIT_LOCKED_UNARY_OP(dec, 16)
3062EMIT_LOCKED_UNARY_OP(not, 16)
3063EMIT_LOCKED_UNARY_OP(neg, 16)
3064
3065EMIT_LOCKED_UNARY_OP(inc, 8)
3066EMIT_LOCKED_UNARY_OP(dec, 8)
3067EMIT_LOCKED_UNARY_OP(not, 8)
3068EMIT_LOCKED_UNARY_OP(neg, 8)
3069# endif
3070
3071#endif /* !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY) */
3072
3073
3074/*********************************************************************************************************************************
3075* Shifting and Rotating *
3076*********************************************************************************************************************************/
3077
3078/*
3079 * ROL
3080 */
3081#define EMIT_ROL(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags, a_fnHlp) \
3082IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_rol_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
3083{ \
3084 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
3085 if (cShift) \
3086 { \
3087 if (a_cBitsWidth < 32) \
3088 cShift &= a_cBitsWidth - 1; \
3089 a_uType const uDst = *puDst; \
3090 a_uType const uResult = a_fnHlp(uDst, cShift); \
3091 *puDst = uResult; \
3092 \
3093 /* Calc EFLAGS. The OF bit is undefined if cShift > 1, we implement \
3094 it the same way as for 1 bit shifts. */ \
3095 AssertCompile(X86_EFL_CF_BIT == 0); \
3096 uint32_t fEfl = *pfEFlags; \
3097 fEfl &= ~(X86_EFL_CF | X86_EFL_OF); \
3098 uint32_t const fCarry = (uResult & X86_EFL_CF); \
3099 fEfl |= fCarry; \
3100 if (!a_fIntelFlags) /* AMD 3990X: According to the last sub-shift: */ \
3101 fEfl |= ((uResult >> (a_cBitsWidth - 1)) ^ fCarry) << X86_EFL_OF_BIT; \
3102 else /* Intel 10980XE: According to the first sub-shift: */ \
3103 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uDst << 1)); \
3104 *pfEFlags = fEfl; \
3105 } \
3106}
3107
3108#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3109EMIT_ROL(64, uint64_t, RT_NOTHING, 1, ASMRotateLeftU64)
3110#endif
3111EMIT_ROL(64, uint64_t, _intel, 1, ASMRotateLeftU64)
3112EMIT_ROL(64, uint64_t, _amd, 0, ASMRotateLeftU64)
3113
3114#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3115EMIT_ROL(32, uint32_t, RT_NOTHING, 1, ASMRotateLeftU32)
3116#endif
3117EMIT_ROL(32, uint32_t, _intel, 1, ASMRotateLeftU32)
3118EMIT_ROL(32, uint32_t, _amd, 0, ASMRotateLeftU32)
3119
3120DECL_FORCE_INLINE(uint16_t) iemAImpl_rol_u16_hlp(uint16_t uValue, uint8_t cShift)
3121{
3122 return (uValue << cShift) | (uValue >> (16 - cShift));
3123}
3124#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3125EMIT_ROL(16, uint16_t, RT_NOTHING, 1, iemAImpl_rol_u16_hlp)
3126#endif
3127EMIT_ROL(16, uint16_t, _intel, 1, iemAImpl_rol_u16_hlp)
3128EMIT_ROL(16, uint16_t, _amd, 0, iemAImpl_rol_u16_hlp)
3129
3130DECL_FORCE_INLINE(uint8_t) iemAImpl_rol_u8_hlp(uint8_t uValue, uint8_t cShift)
3131{
3132 return (uValue << cShift) | (uValue >> (8 - cShift));
3133}
3134#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3135EMIT_ROL(8, uint8_t, RT_NOTHING, 1, iemAImpl_rol_u8_hlp)
3136#endif
3137EMIT_ROL(8, uint8_t, _intel, 1, iemAImpl_rol_u8_hlp)
3138EMIT_ROL(8, uint8_t, _amd, 0, iemAImpl_rol_u8_hlp)
3139
3140
3141/*
3142 * ROR
3143 */
3144#define EMIT_ROR(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags, a_fnHlp) \
3145IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_ror_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
3146{ \
3147 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
3148 if (cShift) \
3149 { \
3150 if (a_cBitsWidth < 32) \
3151 cShift &= a_cBitsWidth - 1; \
3152 a_uType const uDst = *puDst; \
3153 a_uType const uResult = a_fnHlp(uDst, cShift); \
3154 *puDst = uResult; \
3155 \
3156 /* Calc EFLAGS: */ \
3157 AssertCompile(X86_EFL_CF_BIT == 0); \
3158 uint32_t fEfl = *pfEFlags; \
3159 fEfl &= ~(X86_EFL_CF | X86_EFL_OF); \
3160 uint32_t const fCarry = (uResult >> ((a_cBitsWidth) - 1)) & X86_EFL_CF; \
3161 fEfl |= fCarry; \
3162 if (!a_fIntelFlags) /* AMD 3990X: According to the last sub-shift: */ \
3163 fEfl |= (((uResult >> ((a_cBitsWidth) - 2)) ^ fCarry) & 1) << X86_EFL_OF_BIT; \
3164 else /* Intel 10980XE: According to the first sub-shift: */ \
3165 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uDst << (a_cBitsWidth - 1))); \
3166 *pfEFlags = fEfl; \
3167 } \
3168}
3169
3170#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3171EMIT_ROR(64, uint64_t, RT_NOTHING, 1, ASMRotateRightU64)
3172#endif
3173EMIT_ROR(64, uint64_t, _intel, 1, ASMRotateRightU64)
3174EMIT_ROR(64, uint64_t, _amd, 0, ASMRotateRightU64)
3175
3176#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3177EMIT_ROR(32, uint32_t, RT_NOTHING, 1, ASMRotateRightU32)
3178#endif
3179EMIT_ROR(32, uint32_t, _intel, 1, ASMRotateRightU32)
3180EMIT_ROR(32, uint32_t, _amd, 0, ASMRotateRightU32)
3181
3182DECL_FORCE_INLINE(uint16_t) iemAImpl_ror_u16_hlp(uint16_t uValue, uint8_t cShift)
3183{
3184 return (uValue >> cShift) | (uValue << (16 - cShift));
3185}
3186#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3187EMIT_ROR(16, uint16_t, RT_NOTHING, 1, iemAImpl_ror_u16_hlp)
3188#endif
3189EMIT_ROR(16, uint16_t, _intel, 1, iemAImpl_ror_u16_hlp)
3190EMIT_ROR(16, uint16_t, _amd, 0, iemAImpl_ror_u16_hlp)
3191
3192DECL_FORCE_INLINE(uint8_t) iemAImpl_ror_u8_hlp(uint8_t uValue, uint8_t cShift)
3193{
3194 return (uValue >> cShift) | (uValue << (8 - cShift));
3195}
3196#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3197EMIT_ROR(8, uint8_t, RT_NOTHING, 1, iemAImpl_ror_u8_hlp)
3198#endif
3199EMIT_ROR(8, uint8_t, _intel, 1, iemAImpl_ror_u8_hlp)
3200EMIT_ROR(8, uint8_t, _amd, 0, iemAImpl_ror_u8_hlp)
3201
3202
3203/*
3204 * RCL
3205 */
3206#define EMIT_RCL(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
3207IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_rcl_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
3208{ \
3209 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
3210 if (a_cBitsWidth < 32 && a_fIntelFlags) \
3211 cShift %= a_cBitsWidth + 1; \
3212 if (cShift) \
3213 { \
3214 if (a_cBitsWidth < 32 && !a_fIntelFlags) \
3215 cShift %= a_cBitsWidth + 1; \
3216 a_uType const uDst = *puDst; \
3217 a_uType uResult = uDst << cShift; \
3218 if (cShift > 1) \
3219 uResult |= uDst >> (a_cBitsWidth + 1 - cShift); \
3220 \
3221 AssertCompile(X86_EFL_CF_BIT == 0); \
3222 uint32_t fEfl = *pfEFlags; \
3223 uint32_t fInCarry = fEfl & X86_EFL_CF; \
3224 uResult |= (a_uType)fInCarry << (cShift - 1); \
3225 \
3226 *puDst = uResult; \
3227 \
3228 /* Calc EFLAGS. */ \
3229 fEfl &= ~(X86_EFL_CF | X86_EFL_OF); \
3230 uint32_t const fOutCarry = a_cBitsWidth >= 32 || a_fIntelFlags || cShift \
3231 ? (uDst >> (a_cBitsWidth - cShift)) & X86_EFL_CF : fInCarry; \
3232 fEfl |= fOutCarry; \
3233 if (!a_fIntelFlags) /* AMD 3990X: According to the last sub-shift: */ \
3234 fEfl |= ((uResult >> (a_cBitsWidth - 1)) ^ fOutCarry) << X86_EFL_OF_BIT; \
3235 else /* Intel 10980XE: According to the first sub-shift: */ \
3236 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uDst << 1)); \
3237 *pfEFlags = fEfl; \
3238 } \
3239}
3240
3241#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3242EMIT_RCL(64, uint64_t, RT_NOTHING, 1)
3243#endif
3244EMIT_RCL(64, uint64_t, _intel, 1)
3245EMIT_RCL(64, uint64_t, _amd, 0)
3246
3247#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3248EMIT_RCL(32, uint32_t, RT_NOTHING, 1)
3249#endif
3250EMIT_RCL(32, uint32_t, _intel, 1)
3251EMIT_RCL(32, uint32_t, _amd, 0)
3252
3253#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3254EMIT_RCL(16, uint16_t, RT_NOTHING, 1)
3255#endif
3256EMIT_RCL(16, uint16_t, _intel, 1)
3257EMIT_RCL(16, uint16_t, _amd, 0)
3258
3259#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3260EMIT_RCL(8, uint8_t, RT_NOTHING, 1)
3261#endif
3262EMIT_RCL(8, uint8_t, _intel, 1)
3263EMIT_RCL(8, uint8_t, _amd, 0)
3264
3265
3266/*
3267 * RCR
3268 */
3269#define EMIT_RCR(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
3270IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_rcr_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
3271{ \
3272 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
3273 if (a_cBitsWidth < 32 && a_fIntelFlags) \
3274 cShift %= a_cBitsWidth + 1; \
3275 if (cShift) \
3276 { \
3277 if (a_cBitsWidth < 32 && !a_fIntelFlags) \
3278 cShift %= a_cBitsWidth + 1; \
3279 a_uType const uDst = *puDst; \
3280 a_uType uResult = uDst >> cShift; \
3281 if (cShift > 1) \
3282 uResult |= uDst << (a_cBitsWidth + 1 - cShift); \
3283 \
3284 AssertCompile(X86_EFL_CF_BIT == 0); \
3285 uint32_t fEfl = *pfEFlags; \
3286 uint32_t fInCarry = fEfl & X86_EFL_CF; \
3287 uResult |= (a_uType)fInCarry << (a_cBitsWidth - cShift); \
3288 *puDst = uResult; \
3289 \
3290 /* Calc EFLAGS. The OF bit is undefined if cShift > 1, we implement \
3291 it the same way as for 1 bit shifts. */ \
3292 fEfl &= ~(X86_EFL_CF | X86_EFL_OF); \
3293 uint32_t const fOutCarry = a_cBitsWidth >= 32 || a_fIntelFlags || cShift \
3294 ? (uDst >> (cShift - 1)) & X86_EFL_CF : fInCarry; \
3295 fEfl |= fOutCarry; \
3296 if (!a_fIntelFlags) /* AMD 3990X: XOR two most signficant bits of the result: */ \
3297 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uResult ^ (uResult << 1)); \
3298 else /* Intel 10980XE: same as AMD, but only for the first sub-shift: */ \
3299 fEfl |= (fInCarry ^ (uint32_t)(uDst >> (a_cBitsWidth - 1))) << X86_EFL_OF_BIT; \
3300 *pfEFlags = fEfl; \
3301 } \
3302}
3303
3304#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3305EMIT_RCR(64, uint64_t, RT_NOTHING, 1)
3306#endif
3307EMIT_RCR(64, uint64_t, _intel, 1)
3308EMIT_RCR(64, uint64_t, _amd, 0)
3309
3310#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3311EMIT_RCR(32, uint32_t, RT_NOTHING, 1)
3312#endif
3313EMIT_RCR(32, uint32_t, _intel, 1)
3314EMIT_RCR(32, uint32_t, _amd, 0)
3315
3316#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3317EMIT_RCR(16, uint16_t, RT_NOTHING, 1)
3318#endif
3319EMIT_RCR(16, uint16_t, _intel, 1)
3320EMIT_RCR(16, uint16_t, _amd, 0)
3321
3322#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3323EMIT_RCR(8, uint8_t, RT_NOTHING, 1)
3324#endif
3325EMIT_RCR(8, uint8_t, _intel, 1)
3326EMIT_RCR(8, uint8_t, _amd, 0)
3327
3328
3329/*
3330 * SHL
3331 */
3332#define EMIT_SHL(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
3333IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shl_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
3334{ \
3335 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
3336 if (cShift) \
3337 { \
3338 a_uType const uDst = *puDst; \
3339 a_uType uResult = uDst << cShift; \
3340 *puDst = uResult; \
3341 \
3342 /* Calc EFLAGS. */ \
3343 AssertCompile(X86_EFL_CF_BIT == 0); \
3344 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
3345 uint32_t fCarry = (uDst >> (a_cBitsWidth - cShift)) & X86_EFL_CF; \
3346 fEfl |= fCarry; \
3347 if (!a_fIntelFlags) \
3348 fEfl |= ((uResult >> (a_cBitsWidth - 1)) ^ fCarry) << X86_EFL_OF_BIT; /* AMD 3990X: Last shift result. */ \
3349 else \
3350 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uDst << 1)); /* Intel 10980XE: First shift result. */ \
3351 fEfl |= X86_EFL_CALC_SF(uResult, a_cBitsWidth); \
3352 fEfl |= X86_EFL_CALC_ZF(uResult); \
3353 fEfl |= IEM_EFL_CALC_PARITY(uResult); \
3354 if (!a_fIntelFlags) \
3355 fEfl |= X86_EFL_AF; /* AMD 3990x sets it unconditionally, Intel 10980XE does the oposite */ \
3356 *pfEFlags = fEfl; \
3357 } \
3358}
3359
3360#if !defined(RT_ARCH_ARM64)
3361
3362# if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3363EMIT_SHL(64, uint64_t, RT_NOTHING, 1)
3364# endif
3365EMIT_SHL(64, uint64_t, _intel, 1)
3366EMIT_SHL(64, uint64_t, _amd, 0)
3367
3368# if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3369EMIT_SHL(32, uint32_t, RT_NOTHING, 1)
3370# endif
3371EMIT_SHL(32, uint32_t, _intel, 1)
3372EMIT_SHL(32, uint32_t, _amd, 0)
3373
3374# if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3375EMIT_SHL(16, uint16_t, RT_NOTHING, 1)
3376# endif
3377EMIT_SHL(16, uint16_t, _intel, 1)
3378EMIT_SHL(16, uint16_t, _amd, 0)
3379
3380# if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3381EMIT_SHL(8, uint8_t, RT_NOTHING, 1)
3382# endif
3383EMIT_SHL(8, uint8_t, _intel, 1)
3384EMIT_SHL(8, uint8_t, _amd, 0)
3385
3386#endif /* !RT_ARCH_ARM64 */
3387
3388
3389/*
3390 * SHR
3391 */
3392#define EMIT_SHR(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
3393IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shr_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
3394{ \
3395 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
3396 if (cShift) \
3397 { \
3398 a_uType const uDst = *puDst; \
3399 a_uType uResult = uDst >> cShift; \
3400 *puDst = uResult; \
3401 \
3402 /* Calc EFLAGS. */ \
3403 AssertCompile(X86_EFL_CF_BIT == 0); \
3404 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
3405 fEfl |= (uDst >> (cShift - 1)) & X86_EFL_CF; \
3406 if (a_fIntelFlags || cShift == 1) /* AMD 3990x does what intel documents; Intel 10980XE does this for all shift counts. */ \
3407 fEfl |= (uDst >> (a_cBitsWidth - 1)) << X86_EFL_OF_BIT; \
3408 fEfl |= X86_EFL_CALC_SF(uResult, a_cBitsWidth); \
3409 fEfl |= X86_EFL_CALC_ZF(uResult); \
3410 fEfl |= IEM_EFL_CALC_PARITY(uResult); \
3411 if (!a_fIntelFlags) \
3412 fEfl |= X86_EFL_AF; /* AMD 3990x sets it unconditionally, Intel 10980XE does the oposite */ \
3413 *pfEFlags = fEfl; \
3414 } \
3415}
3416
3417#if !defined(RT_ARCH_ARM64)
3418
3419# if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3420EMIT_SHR(64, uint64_t, RT_NOTHING, 1)
3421# endif
3422EMIT_SHR(64, uint64_t, _intel, 1)
3423EMIT_SHR(64, uint64_t, _amd, 0)
3424
3425# if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3426EMIT_SHR(32, uint32_t, RT_NOTHING, 1)
3427# endif
3428EMIT_SHR(32, uint32_t, _intel, 1)
3429EMIT_SHR(32, uint32_t, _amd, 0)
3430
3431# if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3432EMIT_SHR(16, uint16_t, RT_NOTHING, 1)
3433# endif
3434EMIT_SHR(16, uint16_t, _intel, 1)
3435EMIT_SHR(16, uint16_t, _amd, 0)
3436
3437# if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3438EMIT_SHR(8, uint8_t, RT_NOTHING, 1)
3439# endif
3440EMIT_SHR(8, uint8_t, _intel, 1)
3441EMIT_SHR(8, uint8_t, _amd, 0)
3442
3443#endif /* !RT_ARCH_ARM64 */
3444
3445
3446/*
3447 * SAR
3448 */
/*
 * EMIT_SAR - emits iemAImpl_sar_u<width><suffix>, which shifts *puDst right
 * arithmetically (the operand is shifted as the signed type a_iType).
 *
 * The count is masked with 63 for 64-bit operands and with 31 for all other
 * widths; a masked count of zero leaves operand and flags untouched.
 *
 * All status flags are recalculated.  CF is the last bit shifted out.  OF is
 * always left clear.  AF is set by the AMD flavour and cleared by the Intel
 * one.
 */
#define EMIT_SAR(a_cBitsWidth, a_uType, a_iType, a_Suffix, a_fIntelFlags) \
IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_sar_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
{ \
    cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
    if (!cShift) \
        return; /* Nothing to do, neither operand nor flags change. */ \
    \
    a_iType const iOrg     = (a_iType)*puDst; \
    a_uType const uShifted = iOrg >> cShift; \
    *puDst = uShifted; \
    \
    /* Calc EFLAGS. \
       Note! OF is cleared together with the other status bits and never set \
             again, as the sign of the result never differs from the input. */ \
    AssertCompile(X86_EFL_CF_BIT == 0); \
    uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
    fEfl |= (iOrg >> (cShift - 1)) & X86_EFL_CF; \
    if (!a_fIntelFlags) \
        fEfl |= X86_EFL_AF; /* AMD 3990x sets it unconditionally, Intel 10980XE does the opposite */ \
    fEfl |= IEM_EFL_CALC_PARITY(uShifted); \
    fEfl |= X86_EFL_CALC_ZF(uShifted); \
    fEfl |= X86_EFL_CALC_SF(uShifted, a_cBitsWidth); \
    *pfEFlags = fEfl; \
}
3472
3473#if !defined(RT_ARCH_ARM64)
3474
3475# if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3476EMIT_SAR(64, uint64_t, int64_t, RT_NOTHING, 1)
3477# endif
3478EMIT_SAR(64, uint64_t, int64_t, _intel, 1)
3479EMIT_SAR(64, uint64_t, int64_t, _amd, 0)
3480
3481# if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3482EMIT_SAR(32, uint32_t, int32_t, RT_NOTHING, 1)
3483# endif
3484EMIT_SAR(32, uint32_t, int32_t, _intel, 1)
3485EMIT_SAR(32, uint32_t, int32_t, _amd, 0)
3486
3487# if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3488EMIT_SAR(16, uint16_t, int16_t, RT_NOTHING, 1)
3489# endif
3490EMIT_SAR(16, uint16_t, int16_t, _intel, 1)
3491EMIT_SAR(16, uint16_t, int16_t, _amd, 0)
3492
3493# if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3494EMIT_SAR(8, uint8_t, int8_t, RT_NOTHING, 1)
3495# endif
3496EMIT_SAR(8, uint8_t, int8_t, _intel, 1)
3497EMIT_SAR(8, uint8_t, int8_t, _amd, 0)
3498
3499#endif /* !RT_ARCH_ARM64 */
3500
3501
3502/*
3503 * SHLD
3504 *
3505 * - CF is the last bit shifted out of puDst.
3506 * - AF is always cleared by Intel 10980XE.
3507 * - AF is always set by AMD 3990X.
3508 * - OF is set according to the first shift on Intel 10980XE, it seems.
3509 * - OF is set according to the last sub-shift on AMD 3990X.
3510 * - ZF, SF and PF are calculated according to the result by both vendors.
3511 *
3512 * For 16-bit shifts the count mask isn't 15, but 31, and the CPU will
3513 * pick either the source register or the destination register for input bits
3514 * when going beyond 16. According to https://www.sandpile.org/x86/flags.htm
3515 * intel has changed behaviour here several times. We implement what current
3516 * skylake based does for now, we can extend this later as needed.
3517 */
3518#define EMIT_SHLD(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
3519IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shld_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, uint8_t cShift, \
3520 uint32_t *pfEFlags)) \
3521{ \
3522 cShift &= a_cBitsWidth - 1; \
3523 if (cShift) \
3524 { \
3525 a_uType const uDst = *puDst; \
3526 a_uType uResult = uDst << cShift; \
3527 uResult |= uSrc >> (a_cBitsWidth - cShift); \
3528 *puDst = uResult; \
3529 \
3530 /* CALC EFLAGS: */ \
3531 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
3532 if (a_fIntelFlags) \
3533 /* Intel 6700K & 10980XE: Set according to the first shift. AF always cleared. */ \
3534 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uDst << 1)); \
3535 else \
3536 { /* AMD 3990X: Set according to last shift. AF always set. */ \
3537 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth((uDst << (cShift - 1)) ^ uResult); \
3538 fEfl |= X86_EFL_AF; \
3539 } \
3540 AssertCompile(X86_EFL_CF_BIT == 0); \
3541 fEfl |= (uDst >> (a_cBitsWidth - cShift)) & X86_EFL_CF; /* CF = last bit shifted out */ \
3542 fEfl |= IEM_EFL_CALC_PARITY(uResult); \
3543 fEfl |= X86_EFL_CALC_SF(uResult, a_cBitsWidth); \
3544 fEfl |= X86_EFL_CALC_ZF(uResult); \
3545 *pfEFlags = fEfl; \
3546 } \
3547}
3548
3549#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3550EMIT_SHLD(64, uint64_t, RT_NOTHING, 1)
3551#endif
3552EMIT_SHLD(64, uint64_t, _intel, 1)
3553EMIT_SHLD(64, uint64_t, _amd, 0)
3554
3555#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3556EMIT_SHLD(32, uint32_t, RT_NOTHING, 1)
3557#endif
3558EMIT_SHLD(32, uint32_t, _intel, 1)
3559EMIT_SHLD(32, uint32_t, _amd, 0)
3560
3561#define EMIT_SHLD_16(a_Suffix, a_fIntelFlags) \
3562IEM_DECL_IMPL_DEF(void, RT_CONCAT(iemAImpl_shld_u16,a_Suffix),(uint16_t *puDst, uint16_t uSrc, uint8_t cShift, uint32_t *pfEFlags)) \
3563{ \
3564 cShift &= 31; \
3565 if (cShift) \
3566 { \
3567 uint16_t const uDst = *puDst; \
3568 uint64_t const uTmp = a_fIntelFlags \
3569 ? ((uint64_t)uDst << 32) | ((uint32_t)uSrc << 16) | uDst \
3570 : ((uint64_t)uDst << 32) | ((uint32_t)uSrc << 16) | uSrc; \
3571 uint16_t const uResult = (uint16_t)((uTmp << cShift) >> 32); \
3572 *puDst = uResult; \
3573 \
3574 /* CALC EFLAGS: */ \
3575 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
3576 AssertCompile(X86_EFL_CF_BIT == 0); \
3577 if (a_fIntelFlags) \
3578 { \
3579 fEfl |= (uTmp >> (48 - cShift)) & X86_EFL_CF; /* CF = last bit shifted out of the combined operand */ \
3580 /* Intel 6700K & 10980XE: OF is et according to the first shift. AF always cleared. */ \
3581 fEfl |= X86_EFL_GET_OF_16(uDst ^ (uDst << 1)); \
3582 } \
3583 else \
3584 { \
3585 /* AMD 3990X: OF is set according to last shift, with some weirdness. AF always set. CF = last bit shifted out of uDst. */ \
3586 if (cShift < 16) \
3587 { \
3588 fEfl |= (uDst >> (16 - cShift)) & X86_EFL_CF; \
3589 fEfl |= X86_EFL_GET_OF_16((uDst << (cShift - 1)) ^ uResult); \
3590 } \
3591 else \
3592 { \
3593 if (cShift == 16) \
3594 fEfl |= uDst & X86_EFL_CF; \
3595 fEfl |= X86_EFL_GET_OF_16((uDst << (cShift - 1)) ^ 0); \
3596 } \
3597 fEfl |= X86_EFL_AF; \
3598 } \
3599 fEfl |= IEM_EFL_CALC_PARITY(uResult); \
3600 fEfl |= X86_EFL_CALC_SF(uResult, 16); \
3601 fEfl |= X86_EFL_CALC_ZF(uResult); \
3602 *pfEFlags = fEfl; \
3603 } \
3604}
3605
3606#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3607EMIT_SHLD_16(RT_NOTHING, 1)
3608#endif
3609EMIT_SHLD_16(_intel, 1)
3610EMIT_SHLD_16(_amd, 0)
3611
3612
3613/*
3614 * SHRD
3615 *
3616 * EFLAGS behaviour seems to be the same as with SHLD:
3617 * - CF is the last bit shifted out of puDst.
3618 * - AF is always cleared by Intel 10980XE.
3619 * - AF is always set by AMD 3990X.
3620 * - OF is set according to the first shift on Intel 10980XE, it seems.
3621 * - OF is set according to the last sub-shift on AMD 3990X.
3622 * - ZF, SF and PF are calculated according to the result by both vendors.
3623 *
3624 * For 16-bit shifts the count mask isn't 15, but 31, and the CPU will
3625 * pick either the source register or the destination register for input bits
3626 * when going beyond 16. According to https://www.sandpile.org/x86/flags.htm
3627 * intel has changed behaviour here several times. We implement what current
3628 * skylake based does for now, we can extend this later as needed.
3629 */
3630#define EMIT_SHRD(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
3631IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shrd_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, uint8_t cShift, uint32_t *pfEFlags)) \
3632{ \
3633 cShift &= a_cBitsWidth - 1; \
3634 if (cShift) \
3635 { \
3636 a_uType const uDst = *puDst; \
3637 a_uType uResult = uDst >> cShift; \
3638 uResult |= uSrc << (a_cBitsWidth - cShift); \
3639 *puDst = uResult; \
3640 \
3641 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
3642 AssertCompile(X86_EFL_CF_BIT == 0); \
3643 fEfl |= (uDst >> (cShift - 1)) & X86_EFL_CF; \
3644 if (a_fIntelFlags) \
3645 /* Intel 6700K & 10980XE: Set according to the first shift. AF always cleared. */ \
3646 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uSrc << (a_cBitsWidth - 1))); \
3647 else \
3648 { /* AMD 3990X: Set according to last shift. AF always set. */ \
3649 if (cShift > 1) /* Set according to last shift. */ \
3650 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth((uSrc << (a_cBitsWidth - cShift + 1)) ^ uResult); \
3651 else \
3652 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ uResult); \
3653 fEfl |= X86_EFL_AF; \
3654 } \
3655 fEfl |= X86_EFL_CALC_SF(uResult, a_cBitsWidth); \
3656 fEfl |= X86_EFL_CALC_ZF(uResult); \
3657 fEfl |= IEM_EFL_CALC_PARITY(uResult); \
3658 *pfEFlags = fEfl; \
3659 } \
3660}
3661
3662#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3663EMIT_SHRD(64, uint64_t, RT_NOTHING, 1)
3664#endif
3665EMIT_SHRD(64, uint64_t, _intel, 1)
3666EMIT_SHRD(64, uint64_t, _amd, 0)
3667
3668#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3669EMIT_SHRD(32, uint32_t, RT_NOTHING, 1)
3670#endif
3671EMIT_SHRD(32, uint32_t, _intel, 1)
3672EMIT_SHRD(32, uint32_t, _amd, 0)
3673
3674#define EMIT_SHRD_16(a_Suffix, a_fIntelFlags) \
3675IEM_DECL_IMPL_DEF(void, RT_CONCAT(iemAImpl_shrd_u16,a_Suffix),(uint16_t *puDst, uint16_t uSrc, uint8_t cShift, uint32_t *pfEFlags)) \
3676{ \
3677 cShift &= 31; \
3678 if (cShift) \
3679 { \
3680 uint16_t const uDst = *puDst; \
3681 uint64_t const uTmp = a_fIntelFlags \
3682 ? uDst | ((uint32_t)uSrc << 16) | ((uint64_t)uDst << 32) \
3683 : uDst | ((uint32_t)uSrc << 16) | ((uint64_t)uSrc << 32); \
3684 uint16_t const uResult = (uint16_t)(uTmp >> cShift); \
3685 *puDst = uResult; \
3686 \
3687 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
3688 AssertCompile(X86_EFL_CF_BIT == 0); \
3689 if (a_fIntelFlags) \
3690 { \
3691 /* Intel 10980XE: The CF is the last shifted out of the combined uTmp operand. */ \
3692 fEfl |= (uTmp >> (cShift - 1)) & X86_EFL_CF; \
3693 /* Intel 6700K & 10980XE: Set according to the first shift. AF always cleared. */ \
3694 fEfl |= X86_EFL_GET_OF_16(uDst ^ (uSrc << 15)); \
3695 } \
3696 else \
3697 { \
3698 /* AMD 3990X: CF flag seems to be last bit shifted out of uDst, not the combined uSrc:uSrc:uDst operand. */ \
3699 fEfl |= (uDst >> (cShift - 1)) & X86_EFL_CF; \
3700 /* AMD 3990X: Set according to last shift. AF always set. */ \
3701 if (cShift > 1) /* Set according to last shift. */ \
3702 fEfl |= X86_EFL_GET_OF_16((uint16_t)(uTmp >> (cShift - 1)) ^ uResult); \
3703 else \
3704 fEfl |= X86_EFL_GET_OF_16(uDst ^ uResult); \
3705 fEfl |= X86_EFL_AF; \
3706 } \
3707 fEfl |= X86_EFL_CALC_SF(uResult, 16); \
3708 fEfl |= X86_EFL_CALC_ZF(uResult); \
3709 fEfl |= IEM_EFL_CALC_PARITY(uResult); \
3710 *pfEFlags = fEfl; \
3711 } \
3712}
3713
3714#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3715EMIT_SHRD_16(RT_NOTHING, 1)
3716#endif
3717EMIT_SHRD_16(_intel, 1)
3718EMIT_SHRD_16(_amd, 0)
3719
3720
3721/*
3722 * RORX (BMI2)
3723 */
3724#define EMIT_RORX(a_cBitsWidth, a_uType, a_fnHlp) \
3725IEM_DECL_IMPL_DEF(void, RT_CONCAT(iemAImpl_rorx_u,a_cBitsWidth),(a_uType *puDst, a_uType uSrc, a_uType cShift)) \
3726{ \
3727 *puDst = a_fnHlp(uSrc, cShift & (a_cBitsWidth - 1)); \
3728}
3729
3730#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3731EMIT_RORX(64, uint64_t, ASMRotateRightU64)
3732#endif
3733#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3734EMIT_RORX(32, uint32_t, ASMRotateRightU32)
3735#endif
3736
3737
3738/*
3739 * SHLX (BMI2)
3740 */
3741#define EMIT_SHLX(a_cBitsWidth, a_uType, a_Suffix) \
3742IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shlx_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, a_uType cShift)) \
3743{ \
3744 cShift &= a_cBitsWidth - 1; \
3745 *puDst = uSrc << cShift; \
3746}
3747
3748#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3749EMIT_SHLX(64, uint64_t, RT_NOTHING)
3750EMIT_SHLX(64, uint64_t, _fallback)
3751#endif
3752#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3753EMIT_SHLX(32, uint32_t, RT_NOTHING)
3754EMIT_SHLX(32, uint32_t, _fallback)
3755#endif
3756
3757
3758/*
3759 * SHRX (BMI2)
3760 */
3761#define EMIT_SHRX(a_cBitsWidth, a_uType, a_Suffix) \
3762IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shrx_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, a_uType cShift)) \
3763{ \
3764 cShift &= a_cBitsWidth - 1; \
3765 *puDst = uSrc >> cShift; \
3766}
3767
3768#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3769EMIT_SHRX(64, uint64_t, RT_NOTHING)
3770EMIT_SHRX(64, uint64_t, _fallback)
3771#endif
3772#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3773EMIT_SHRX(32, uint32_t, RT_NOTHING)
3774EMIT_SHRX(32, uint32_t, _fallback)
3775#endif
3776
3777
3778/*
3779 * SARX (BMI2)
3780 */
3781#define EMIT_SARX(a_cBitsWidth, a_uType, a_iType, a_Suffix) \
3782IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_sarx_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, a_uType cShift)) \
3783{ \
3784 cShift &= a_cBitsWidth - 1; \
3785 *puDst = (a_iType)uSrc >> cShift; \
3786}
3787
3788#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3789EMIT_SARX(64, uint64_t, int64_t, RT_NOTHING)
3790EMIT_SARX(64, uint64_t, int64_t, _fallback)
3791#endif
3792#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3793EMIT_SARX(32, uint32_t, int32_t, RT_NOTHING)
3794EMIT_SARX(32, uint32_t, int32_t, _fallback)
3795#endif
3796
3797
3798/*
3799 * PDEP (BMI2)
3800 */
3801#define EMIT_PDEP(a_cBitsWidth, a_uType, a_Suffix) \
3802IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_pdep_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, a_uType fMask)) \
3803{ \
3804 a_uType uResult = 0; \
3805 for (unsigned iMaskBit = 0, iBit = 0; iMaskBit < a_cBitsWidth; iMaskBit++) \
3806 if (fMask & ((a_uType)1 << iMaskBit)) \
3807 { \
3808 uResult |= ((uSrc >> iBit) & 1) << iMaskBit; \
3809 iBit++; \
3810 } \
3811 *puDst = uResult; \
3812}
3813
3814#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3815EMIT_PDEP(64, uint64_t, RT_NOTHING)
3816#endif
3817EMIT_PDEP(64, uint64_t, _fallback)
3818#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3819EMIT_PDEP(32, uint32_t, RT_NOTHING)
3820#endif
3821EMIT_PDEP(32, uint32_t, _fallback)
3822
3823/*
3824 * PEXT (BMI2)
3825 */
3826#define EMIT_PEXT(a_cBitsWidth, a_uType, a_Suffix) \
3827IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_pext_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, a_uType fMask)) \
3828{ \
3829 a_uType uResult = 0; \
3830 for (unsigned iMaskBit = 0, iBit = 0; iMaskBit < a_cBitsWidth; iMaskBit++) \
3831 if (fMask & ((a_uType)1 << iMaskBit)) \
3832 { \
3833 uResult |= ((uSrc >> iMaskBit) & 1) << iBit; \
3834 iBit++; \
3835 } \
3836 *puDst = uResult; \
3837}
3838
3839#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3840EMIT_PEXT(64, uint64_t, RT_NOTHING)
3841#endif
3842EMIT_PEXT(64, uint64_t, _fallback)
3843#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3844EMIT_PEXT(32, uint32_t, RT_NOTHING)
3845#endif
3846EMIT_PEXT(32, uint32_t, _fallback)
3847
3848
3849#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3850
3851# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
3852/*
3853 * BSWAP
3854 */
3855
3856IEM_DECL_IMPL_DEF(void, iemAImpl_bswap_u64,(uint64_t *puDst))
3857{
3858 *puDst = ASMByteSwapU64(*puDst);
3859}
3860
3861
3862IEM_DECL_IMPL_DEF(void, iemAImpl_bswap_u32,(uint32_t *puDst))
3863{
3864 *puDst = ASMByteSwapU32(*puDst);
3865}
3866
3867
3868/* Note! undocument, so 32-bit arg */
3869IEM_DECL_IMPL_DEF(void, iemAImpl_bswap_u16,(uint32_t *puDst))
3870{
3871#if 0
3872 *(uint16_t *)puDst = ASMByteSwapU16(*(uint16_t *)puDst);
3873#else
3874 /* This is the behaviour AMD 3990x (64-bit mode): */
3875 *(uint16_t *)puDst = 0;
3876#endif
3877}
3878
3879# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
3880
3881
3882
3883# if defined(IEM_WITHOUT_ASSEMBLY)
3884
3885/*
3886 * LFENCE, SFENCE & MFENCE.
3887 */
3888
3889IEM_DECL_IMPL_DEF(void, iemAImpl_lfence,(void))
3890{
3891 ASMReadFence();
3892}
3893
3894
3895IEM_DECL_IMPL_DEF(void, iemAImpl_sfence,(void))
3896{
3897 ASMWriteFence();
3898}
3899
3900
3901IEM_DECL_IMPL_DEF(void, iemAImpl_mfence,(void))
3902{
3903 ASMMemoryFence();
3904}
3905
3906
3907# ifndef RT_ARCH_ARM64
3908IEM_DECL_IMPL_DEF(void, iemAImpl_alt_mem_fence,(void))
3909{
3910 ASMMemoryFence();
3911}
3912# endif
3913
3914# endif
3915
3916#endif /* !RT_ARCH_AMD64 || IEM_WITHOUT_ASSEMBLY */
3917
3918
3919IEM_DECL_IMPL_DEF(void, iemAImpl_arpl,(uint16_t *pu16Dst, uint16_t u16Src, uint32_t *pfEFlags))
3920{
3921 if ((*pu16Dst & X86_SEL_RPL) < (u16Src & X86_SEL_RPL))
3922 {
3923 *pu16Dst &= X86_SEL_MASK_OFF_RPL;
3924 *pu16Dst |= u16Src & X86_SEL_RPL;
3925
3926 *pfEFlags |= X86_EFL_ZF;
3927 }
3928 else
3929 *pfEFlags &= ~X86_EFL_ZF;
3930}
3931
3932
3933#if defined(IEM_WITHOUT_ASSEMBLY)
3934
3935/*********************************************************************************************************************************
3936* x87 FPU Loads *
3937*********************************************************************************************************************************/
3938
3939IEM_DECL_IMPL_DEF(void, iemAImpl_fld_r80_from_r32,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT32U pr32Val))
3940{
3941 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3942 if (RTFLOAT32U_IS_NORMAL(pr32Val))
3943 {
3944 pFpuRes->r80Result.sj64.fSign = pr32Val->s.fSign;
3945 pFpuRes->r80Result.sj64.fInteger = 1;
3946 pFpuRes->r80Result.sj64.uFraction = (uint64_t)pr32Val->s.uFraction
3947 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS);
3948 pFpuRes->r80Result.sj64.uExponent = pr32Val->s.uExponent - RTFLOAT32U_EXP_BIAS + RTFLOAT80U_EXP_BIAS;
3949 Assert(RTFLOAT80U_IS_NORMAL(&pFpuRes->r80Result));
3950 }
3951 else if (RTFLOAT32U_IS_ZERO(pr32Val))
3952 {
3953 pFpuRes->r80Result.s.fSign = pr32Val->s.fSign;
3954 pFpuRes->r80Result.s.uExponent = 0;
3955 pFpuRes->r80Result.s.uMantissa = 0;
3956 Assert(RTFLOAT80U_IS_ZERO(&pFpuRes->r80Result));
3957 }
3958 else if (RTFLOAT32U_IS_SUBNORMAL(pr32Val))
3959 {
3960 /* Subnormal values gets normalized. */
3961 pFpuRes->r80Result.sj64.fSign = pr32Val->s.fSign;
3962 pFpuRes->r80Result.sj64.fInteger = 1;
3963 unsigned const cExtraShift = RTFLOAT32U_FRACTION_BITS - ASMBitLastSetU32(pr32Val->s.uFraction);
3964 pFpuRes->r80Result.sj64.uFraction = (uint64_t)pr32Val->s.uFraction
3965 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS + cExtraShift + 1);
3966 pFpuRes->r80Result.sj64.uExponent = pr32Val->s.uExponent - RTFLOAT32U_EXP_BIAS + RTFLOAT80U_EXP_BIAS - cExtraShift;
3967 pFpuRes->FSW |= X86_FSW_DE;
3968 if (!(pFpuState->FCW & X86_FCW_DM))
3969 pFpuRes->FSW |= X86_FSW_ES | X86_FSW_B; /* The value is still pushed. */
3970 }
3971 else if (RTFLOAT32U_IS_INF(pr32Val))
3972 {
3973 pFpuRes->r80Result.s.fSign = pr32Val->s.fSign;
3974 pFpuRes->r80Result.s.uExponent = RTFLOAT80U_EXP_MAX;
3975 pFpuRes->r80Result.s.uMantissa = RT_BIT_64(63);
3976 Assert(RTFLOAT80U_IS_INF(&pFpuRes->r80Result));
3977 }
3978 else
3979 {
3980 /* Signalling and quiet NaNs, both turn into quiet ones when loaded (weird). */
3981 Assert(RTFLOAT32U_IS_NAN(pr32Val));
3982 pFpuRes->r80Result.sj64.fSign = pr32Val->s.fSign;
3983 pFpuRes->r80Result.sj64.uExponent = RTFLOAT80U_EXP_MAX;
3984 pFpuRes->r80Result.sj64.fInteger = 1;
3985 pFpuRes->r80Result.sj64.uFraction = (uint64_t)pr32Val->s.uFraction
3986 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS);
3987 if (RTFLOAT32U_IS_SIGNALLING_NAN(pr32Val))
3988 {
3989 pFpuRes->r80Result.sj64.uFraction |= RT_BIT_64(62); /* make quiet */
3990 Assert(RTFLOAT80U_IS_QUIET_NAN(&pFpuRes->r80Result));
3991 pFpuRes->FSW |= X86_FSW_IE;
3992
3993 if (!(pFpuState->FCW & X86_FCW_IM))
3994 {
3995 /* The value is not pushed. */
3996 pFpuRes->FSW &= ~X86_FSW_TOP_MASK;
3997 pFpuRes->FSW |= X86_FSW_ES | X86_FSW_B;
3998 pFpuRes->r80Result.au64[0] = 0;
3999 pFpuRes->r80Result.au16[4] = 0;
4000 }
4001 }
4002 else
4003 Assert(RTFLOAT80U_IS_QUIET_NAN(&pFpuRes->r80Result));
4004 }
4005}
4006
4007
4008IEM_DECL_IMPL_DEF(void, iemAImpl_fld_r80_from_r64,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT64U pr64Val))
4009{
4010 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
4011 if (RTFLOAT64U_IS_NORMAL(pr64Val))
4012 {
4013 pFpuRes->r80Result.sj64.fSign = pr64Val->s.fSign;
4014 pFpuRes->r80Result.sj64.fInteger = 1;
4015 pFpuRes->r80Result.sj64.uFraction = pr64Val->s64.uFraction << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS);
4016 pFpuRes->r80Result.sj64.uExponent = pr64Val->s.uExponent - RTFLOAT64U_EXP_BIAS + RTFLOAT80U_EXP_BIAS;
4017 Assert(RTFLOAT80U_IS_NORMAL(&pFpuRes->r80Result));
4018 }
4019 else if (RTFLOAT64U_IS_ZERO(pr64Val))
4020 {
4021 pFpuRes->r80Result.s.fSign = pr64Val->s.fSign;
4022 pFpuRes->r80Result.s.uExponent = 0;
4023 pFpuRes->r80Result.s.uMantissa = 0;
4024 Assert(RTFLOAT80U_IS_ZERO(&pFpuRes->r80Result));
4025 }
4026 else if (RTFLOAT64U_IS_SUBNORMAL(pr64Val))
4027 {
4028 /* Subnormal values gets normalized. */
4029 pFpuRes->r80Result.sj64.fSign = pr64Val->s.fSign;
4030 pFpuRes->r80Result.sj64.fInteger = 1;
4031 unsigned const cExtraShift = RTFLOAT64U_FRACTION_BITS - ASMBitLastSetU64(pr64Val->s64.uFraction);
4032 pFpuRes->r80Result.sj64.uFraction = pr64Val->s64.uFraction
4033 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS + cExtraShift + 1);
4034 pFpuRes->r80Result.sj64.uExponent = pr64Val->s.uExponent - RTFLOAT64U_EXP_BIAS + RTFLOAT80U_EXP_BIAS - cExtraShift;
4035 pFpuRes->FSW |= X86_FSW_DE;
4036 if (!(pFpuState->FCW & X86_FCW_DM))
4037 pFpuRes->FSW |= X86_FSW_ES | X86_FSW_B; /* The value is still pushed. */
4038 }
4039 else if (RTFLOAT64U_IS_INF(pr64Val))
4040 {
4041 pFpuRes->r80Result.s.fSign = pr64Val->s.fSign;
4042 pFpuRes->r80Result.s.uExponent = RTFLOAT80U_EXP_MAX;
4043 pFpuRes->r80Result.s.uMantissa = RT_BIT_64(63);
4044 Assert(RTFLOAT80U_IS_INF(&pFpuRes->r80Result));
4045 }
4046 else
4047 {
4048 /* Signalling and quiet NaNs, both turn into quiet ones when loaded (weird). */
4049 Assert(RTFLOAT64U_IS_NAN(pr64Val));
4050 pFpuRes->r80Result.sj64.fSign = pr64Val->s.fSign;
4051 pFpuRes->r80Result.sj64.uExponent = RTFLOAT80U_EXP_MAX;
4052 pFpuRes->r80Result.sj64.fInteger = 1;
4053 pFpuRes->r80Result.sj64.uFraction = pr64Val->s64.uFraction << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS);
4054 if (RTFLOAT64U_IS_SIGNALLING_NAN(pr64Val))
4055 {
4056 pFpuRes->r80Result.sj64.uFraction |= RT_BIT_64(62); /* make quiet */
4057 Assert(RTFLOAT80U_IS_QUIET_NAN(&pFpuRes->r80Result));
4058 pFpuRes->FSW |= X86_FSW_IE;
4059
4060 if (!(pFpuState->FCW & X86_FCW_IM))
4061 {
4062 /* The value is not pushed. */
4063 pFpuRes->FSW &= ~X86_FSW_TOP_MASK;
4064 pFpuRes->FSW |= X86_FSW_ES | X86_FSW_B;
4065 pFpuRes->r80Result.au64[0] = 0;
4066 pFpuRes->r80Result.au16[4] = 0;
4067 }
4068 }
4069 else
4070 Assert(RTFLOAT80U_IS_QUIET_NAN(&pFpuRes->r80Result));
4071 }
4072}
4073
4074
4075IEM_DECL_IMPL_DEF(void, iemAImpl_fld_r80_from_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
4076{
4077 pFpuRes->r80Result.au64[0] = pr80Val->au64[0];
4078 pFpuRes->r80Result.au16[4] = pr80Val->au16[4];
4079 /* Raises no exceptions. */
4080 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
4081}
4082
4083
4084IEM_DECL_IMPL_DEF(void, iemAImpl_fld1,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
4085{
4086 pFpuRes->r80Result.sj64.fSign = 0;
4087 pFpuRes->r80Result.sj64.uExponent = 0 + 16383;
4088 pFpuRes->r80Result.sj64.fInteger = 1;
4089 pFpuRes->r80Result.sj64.uFraction = 0;
4090
4091 /*
4092 * FPU status word:
4093 * - TOP is irrelevant, but we must match x86 assembly version.
4094 * - C1 is always cleared as we don't have any stack overflows.
4095 * - C0, C2, and C3 are undefined and Intel 10980XE does not touch them.
4096 */
4097 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3));
4098}
4099
4100
4101IEM_DECL_IMPL_DEF(void, iemAImpl_fldl2e,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
4102{
4103 pFpuRes->r80Result.sj64.fSign = 0;
4104 pFpuRes->r80Result.sj64.uExponent = 0 + 16383;
4105 pFpuRes->r80Result.sj64.fInteger = 1;
4106 pFpuRes->r80Result.sj64.uFraction = (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
4107 || (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_UP
4108 ? UINT64_C(0x38aa3b295c17f0bc) : UINT64_C(0x38aa3b295c17f0bb);
4109 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
4110}
4111
4112
4113IEM_DECL_IMPL_DEF(void, iemAImpl_fldl2t,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
4114{
4115 pFpuRes->r80Result.sj64.fSign = 0;
4116 pFpuRes->r80Result.sj64.uExponent = 1 + 16383;
4117 pFpuRes->r80Result.sj64.fInteger = 1;
4118 pFpuRes->r80Result.sj64.uFraction = (pFpuState->FCW & X86_FCW_RC_MASK) != X86_FCW_RC_UP
4119 ? UINT64_C(0x549a784bcd1b8afe) : UINT64_C(0x549a784bcd1b8aff);
4120 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
4121}
4122
4123
4124IEM_DECL_IMPL_DEF(void, iemAImpl_fldlg2,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
4125{
4126 pFpuRes->r80Result.sj64.fSign = 0;
4127 pFpuRes->r80Result.sj64.uExponent = -2 + 16383;
4128 pFpuRes->r80Result.sj64.fInteger = 1;
4129 pFpuRes->r80Result.sj64.uFraction = (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
4130 || (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_UP
4131 ? UINT64_C(0x1a209a84fbcff799) : UINT64_C(0x1a209a84fbcff798);
4132 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
4133}
4134
4135
4136IEM_DECL_IMPL_DEF(void, iemAImpl_fldln2,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
4137{
4138 pFpuRes->r80Result.sj64.fSign = 0;
4139 pFpuRes->r80Result.sj64.uExponent = -1 + 16383;
4140 pFpuRes->r80Result.sj64.fInteger = 1;
4141 pFpuRes->r80Result.sj64.uFraction = (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
4142 || (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_UP
4143 ? UINT64_C(0x317217f7d1cf79ac) : UINT64_C(0x317217f7d1cf79ab);
4144 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
4145}
4146
4147
4148IEM_DECL_IMPL_DEF(void, iemAImpl_fldpi,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
4149{
4150 pFpuRes->r80Result.sj64.fSign = 0;
4151 pFpuRes->r80Result.sj64.uExponent = 1 + 16383;
4152 pFpuRes->r80Result.sj64.fInteger = 1;
4153 pFpuRes->r80Result.sj64.uFraction = (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
4154 || (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_UP
4155 ? UINT64_C(0x490fdaa22168c235) : UINT64_C(0x490fdaa22168c234);
4156 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
4157}
4158
4159
4160IEM_DECL_IMPL_DEF(void, iemAImpl_fldz,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
4161{
4162 pFpuRes->r80Result.s.fSign = 0;
4163 pFpuRes->r80Result.s.uExponent = 0;
4164 pFpuRes->r80Result.s.uMantissa = 0;
4165 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
4166}
4167
4168#define EMIT_FILD(a_cBits) \
4169IEM_DECL_IMPL_DEF(void, iemAImpl_fild_r80_from_i ## a_cBits,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, \
4170 int ## a_cBits ## _t const *piVal)) \
4171{ \
4172 int ## a_cBits ## _t iVal = *piVal; \
4173 if (iVal == 0) \
4174 { \
4175 pFpuRes->r80Result.s.fSign = 0; \
4176 pFpuRes->r80Result.s.uExponent = 0; \
4177 pFpuRes->r80Result.s.uMantissa = 0; \
4178 } \
4179 else \
4180 { \
4181 if (iVal > 0) \
4182 pFpuRes->r80Result.s.fSign = 0; \
4183 else \
4184 { \
4185 pFpuRes->r80Result.s.fSign = 1; \
4186 iVal = -iVal; \
4187 } \
4188 unsigned const cBits = ASMBitLastSetU ## a_cBits((uint ## a_cBits ## _t)iVal); \
4189 pFpuRes->r80Result.s.uExponent = cBits - 1 + RTFLOAT80U_EXP_BIAS; \
4190 pFpuRes->r80Result.s.uMantissa = (uint64_t)iVal << (RTFLOAT80U_FRACTION_BITS + 1 - cBits); \
4191 } \
4192 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */ \
4193}
4194EMIT_FILD(16)
4195EMIT_FILD(32)
4196EMIT_FILD(64)
4197
4198
4199IEM_DECL_IMPL_DEF(void, iemAImpl_fld_r80_from_d80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTPBCD80U pd80Val))
4200{
4201 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
4202 if ( pd80Val->s.abPairs[0] == 0
4203 && pd80Val->s.abPairs[1] == 0
4204 && pd80Val->s.abPairs[2] == 0
4205 && pd80Val->s.abPairs[3] == 0
4206 && pd80Val->s.abPairs[4] == 0
4207 && pd80Val->s.abPairs[5] == 0
4208 && pd80Val->s.abPairs[6] == 0
4209 && pd80Val->s.abPairs[7] == 0
4210 && pd80Val->s.abPairs[8] == 0)
4211 {
4212 pFpuRes->r80Result.s.fSign = pd80Val->s.fSign;
4213 pFpuRes->r80Result.s.uExponent = 0;
4214 pFpuRes->r80Result.s.uMantissa = 0;
4215 }
4216 else
4217 {
4218 pFpuRes->r80Result.s.fSign = pd80Val->s.fSign;
4219
4220 size_t cPairs = RT_ELEMENTS(pd80Val->s.abPairs);
4221 while (cPairs > 0 && pd80Val->s.abPairs[cPairs - 1] == 0)
4222 cPairs--;
4223
4224 uint64_t uVal = 0;
4225 uint64_t uFactor = 1;
4226 for (size_t iPair = 0; iPair < cPairs; iPair++, uFactor *= 100)
4227 uVal += RTPBCD80U_LO_DIGIT(pd80Val->s.abPairs[iPair]) * uFactor
4228 + RTPBCD80U_HI_DIGIT(pd80Val->s.abPairs[iPair]) * uFactor * 10;
4229
4230 unsigned const cBits = ASMBitLastSetU64(uVal);
4231 pFpuRes->r80Result.s.uExponent = cBits - 1 + RTFLOAT80U_EXP_BIAS;
4232 pFpuRes->r80Result.s.uMantissa = uVal << (RTFLOAT80U_FRACTION_BITS + 1 - cBits);
4233 }
4234}
4235
4236
4237/*********************************************************************************************************************************
4238* x87 FPU Stores *
4239*********************************************************************************************************************************/
4240
4241/**
4242 * Helper for storing a deconstructed and normal R80 value as a 64-bit one.
4243 *
4244 * This uses the rounding rules indicated by fFcw and returns updated fFsw.
4245 *
4246 * @returns Updated FPU status word value.
4247 * @param fSignIn Incoming sign indicator.
4248 * @param uMantissaIn Incoming mantissa (dot between bit 63 and 62).
4249 * @param iExponentIn Unbiased exponent.
4250 * @param fFcw The FPU control word.
4251 * @param fFsw Prepped FPU status word, i.e. exceptions and C1 clear.
4252 * @param pr32Dst Where to return the output value, if one should be
4253 * returned.
4254 *
4255 * @note Tailored as a helper for iemAImpl_fst_r80_to_r32 right now.
4256 * @note Exact same logic as iemAImpl_StoreNormalR80AsR64.
4257 */
4258static uint16_t iemAImpl_StoreNormalR80AsR32(bool fSignIn, uint64_t uMantissaIn, int32_t iExponentIn,
4259 uint16_t fFcw, uint16_t fFsw, PRTFLOAT32U pr32Dst)
4260{
4261 uint64_t const fRoundingOffMask = RT_BIT_64(RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS) - 1; /* 0x7ff */
4262 uint64_t const uRoundingAdd = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
4263 ? RT_BIT_64(RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS - 1) /* 0x400 */
4264 : (fFcw & X86_FCW_RC_MASK) == (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)
4265 ? fRoundingOffMask
4266 : 0;
4267 uint64_t fRoundedOff = uMantissaIn & fRoundingOffMask;
4268
4269 /*
4270 * Deal with potential overflows/underflows first, optimizing for none.
4271 * 0 and MAX are used for special values; MAX-1 may be rounded up to MAX.
4272 */
4273 int32_t iExponentOut = (int32_t)iExponentIn + RTFLOAT32U_EXP_BIAS;
4274 if ((uint32_t)iExponentOut - 1 < (uint32_t)(RTFLOAT32U_EXP_MAX - 3))
4275 { /* likely? */ }
4276 /*
4277 * Underflow if the exponent zero or negative. This is attempted mapped
4278 * to a subnormal number when possible, with some additional trickery ofc.
4279 */
4280 else if (iExponentOut <= 0)
4281 {
4282 bool const fIsTiny = iExponentOut < 0
4283 || UINT64_MAX - uMantissaIn > uRoundingAdd;
4284 if (!(fFcw & X86_FCW_UM) && fIsTiny)
4285 /* Note! 754-1985 sec 7.4 has something about bias adjust of 192 here, not in 2008 & 2019. Perhaps only 8087 & 287? */
4286 return fFsw | X86_FSW_UE | X86_FSW_ES | X86_FSW_B;
4287
4288 if (iExponentOut <= 0)
4289 {
4290 uMantissaIn = iExponentOut <= -63
4291 ? uMantissaIn != 0
4292 : (uMantissaIn >> (-iExponentOut + 1)) | ((uMantissaIn & (RT_BIT_64(-iExponentOut + 1) - 1)) != 0);
4293 fRoundedOff = uMantissaIn & fRoundingOffMask;
4294 if (fRoundedOff && fIsTiny)
4295 fFsw |= X86_FSW_UE;
4296 iExponentOut = 0;
4297 }
4298 }
4299 /*
4300 * Overflow if at or above max exponent value or if we will reach max
4301 * when rounding. Will return +/-zero or +/-max value depending on
4302 * whether we're rounding or not.
4303 */
4304 else if ( iExponentOut >= RTFLOAT32U_EXP_MAX
4305 || ( iExponentOut == RTFLOAT32U_EXP_MAX - 1
4306 && UINT64_MAX - uMantissaIn <= uRoundingAdd))
4307 {
4308 fFsw |= X86_FSW_OE;
4309 if (!(fFcw & X86_FCW_OM))
4310 return fFsw | X86_FSW_ES | X86_FSW_B;
4311 fFsw |= X86_FSW_PE;
4312 if (uRoundingAdd)
4313 fFsw |= X86_FSW_C1;
4314 if (!(fFcw & X86_FCW_PM))
4315 fFsw |= X86_FSW_ES | X86_FSW_B;
4316
4317 pr32Dst->s.fSign = fSignIn;
4318 if (uRoundingAdd)
4319 { /* Zero */
4320 pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX;
4321 pr32Dst->s.uFraction = 0;
4322 }
4323 else
4324 { /* Max */
4325 pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX - 1;
4326 pr32Dst->s.uFraction = RT_BIT_32(RTFLOAT32U_FRACTION_BITS) - 1;
4327 }
4328 return fFsw;
4329 }
4330
4331 /*
4332 * Normal or subnormal number.
4333 */
4334 /* Do rounding - just truncate in near mode when midway on an even outcome. */
4335 uint64_t uMantissaOut = uMantissaIn;
4336 if ( (fFcw & X86_FCW_RC_MASK) != X86_FCW_RC_NEAREST
4337 || (uMantissaIn & RT_BIT_64(RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS))
4338 || fRoundedOff != uRoundingAdd)
4339 {
4340 uMantissaOut = uMantissaIn + uRoundingAdd;
4341 if (uMantissaOut >= uMantissaIn)
4342 { /* likely */ }
4343 else
4344 {
4345 uMantissaOut >>= 1; /* (We don't need to add bit 63 here (the integer bit), as it will be chopped off below.) */
4346 iExponentOut++;
4347 Assert(iExponentOut < RTFLOAT32U_EXP_MAX); /* checked above */
4348 fFsw |= X86_FSW_C1;
4349 }
4350 }
4351 else
4352 uMantissaOut = uMantissaIn;
4353
4354 /* Truncate the mantissa and set the return value. */
4355 uMantissaOut >>= RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS;
4356
4357 pr32Dst->s.uFraction = (uint32_t)uMantissaOut; /* Note! too big for bitfield if normal. */
4358 pr32Dst->s.uExponent = iExponentOut;
4359 pr32Dst->s.fSign = fSignIn;
4360
4361 /* Set status flags realted to rounding. */
4362 if (fRoundedOff)
4363 {
4364 fFsw |= X86_FSW_PE;
4365 if (uMantissaOut > (uMantissaIn >> (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS)))
4366 fFsw |= X86_FSW_C1;
4367 if (!(fFcw & X86_FCW_PM))
4368 fFsw |= X86_FSW_ES | X86_FSW_B;
4369 }
4370
4371 return fFsw;
4372}
4373
4374
4375/**
4376 * @note Exact same logic as iemAImpl_fst_r80_to_r64.
4377 */
4378IEM_DECL_IMPL_DEF(void, iemAImpl_fst_r80_to_r32,(PCX86FXSTATE pFpuState, uint16_t *pu16FSW,
4379 PRTFLOAT32U pr32Dst, PCRTFLOAT80U pr80Src))
4380{
4381 uint16_t const fFcw = pFpuState->FCW;
4382 uint16_t fFsw = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3));
4383 if (RTFLOAT80U_IS_NORMAL(pr80Src))
4384 fFsw = iemAImpl_StoreNormalR80AsR32(pr80Src->s.fSign, pr80Src->s.uMantissa,
4385 (int32_t)pr80Src->s.uExponent - RTFLOAT80U_EXP_BIAS, fFcw, fFsw, pr32Dst);
4386 else if (RTFLOAT80U_IS_ZERO(pr80Src))
4387 {
4388 pr32Dst->s.fSign = pr80Src->s.fSign;
4389 pr32Dst->s.uExponent = 0;
4390 pr32Dst->s.uFraction = 0;
4391 Assert(RTFLOAT32U_IS_ZERO(pr32Dst));
4392 }
4393 else if (RTFLOAT80U_IS_INF(pr80Src))
4394 {
4395 pr32Dst->s.fSign = pr80Src->s.fSign;
4396 pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX;
4397 pr32Dst->s.uFraction = 0;
4398 Assert(RTFLOAT32U_IS_INF(pr32Dst));
4399 }
4400 else if (RTFLOAT80U_IS_INDEFINITE(pr80Src))
4401 {
4402 /* Mapped to +/-QNaN */
4403 pr32Dst->s.fSign = pr80Src->s.fSign;
4404 pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX;
4405 pr32Dst->s.uFraction = RT_BIT_32(RTFLOAT32U_FRACTION_BITS - 1);
4406 }
4407 else if (RTFLOAT80U_IS_PSEUDO_INF(pr80Src) || RTFLOAT80U_IS_UNNORMAL(pr80Src) || RTFLOAT80U_IS_PSEUDO_NAN(pr80Src))
4408 {
4409 /* Pseudo-Inf / Pseudo-Nan / Unnormal -> QNaN (during load, probably) */
4410 if (fFcw & X86_FCW_IM)
4411 {
4412 pr32Dst->s.fSign = 1;
4413 pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX;
4414 pr32Dst->s.uFraction = RT_BIT_32(RTFLOAT32U_FRACTION_BITS - 1);
4415 fFsw |= X86_FSW_IE;
4416 }
4417 else
4418 fFsw |= X86_FSW_IE | X86_FSW_ES | X86_FSW_B;;
4419 }
4420 else if (RTFLOAT80U_IS_NAN(pr80Src))
4421 {
4422 /* IM applies to signalled NaN input only. Everything is converted to quiet NaN. */
4423 if ((fFcw & X86_FCW_IM) || !RTFLOAT80U_IS_SIGNALLING_NAN(pr80Src))
4424 {
4425 pr32Dst->s.fSign = pr80Src->s.fSign;
4426 pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX;
4427 pr32Dst->s.uFraction = (uint32_t)(pr80Src->sj64.uFraction >> (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS));
4428 pr32Dst->s.uFraction |= RT_BIT_32(RTFLOAT32U_FRACTION_BITS - 1);
4429 if (RTFLOAT80U_IS_SIGNALLING_NAN(pr80Src))
4430 fFsw |= X86_FSW_IE;
4431 }
4432 else
4433 fFsw |= X86_FSW_IE | X86_FSW_ES | X86_FSW_B;
4434 }
4435 else
4436 {
4437 /* Denormal values causes both an underflow and precision exception. */
4438 Assert(RTFLOAT80U_IS_DENORMAL(pr80Src) || RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Src));
4439 if (fFcw & X86_FCW_UM)
4440 {
4441 pr32Dst->s.fSign = pr80Src->s.fSign;
4442 pr32Dst->s.uExponent = 0;
4443 if ((fFcw & X86_FCW_RC_MASK) == (!pr80Src->s.fSign ? X86_FCW_RC_UP : X86_FCW_RC_DOWN))
4444 {
4445 pr32Dst->s.uFraction = 1;
4446 fFsw |= X86_FSW_UE | X86_FSW_PE | X86_FSW_C1;
4447 if (!(fFcw & X86_FCW_PM))
4448 fFsw |= X86_FSW_ES | X86_FSW_B;
4449 }
4450 else
4451 {
4452 pr32Dst->s.uFraction = 0;
4453 fFsw |= X86_FSW_UE | X86_FSW_PE;
4454 if (!(fFcw & X86_FCW_PM))
4455 fFsw |= X86_FSW_ES | X86_FSW_B;
4456 }
4457 }
4458 else
4459 fFsw |= X86_FSW_UE | X86_FSW_ES | X86_FSW_B;
4460 }
4461 *pu16FSW = fFsw;
4462}
4463
4464
4465/**
4466 * Helper for storing a deconstructed and normal R80 value as a 64-bit one.
4467 *
4468 * This uses the rounding rules indicated by fFcw and returns updated fFsw.
4469 *
4470 * @returns Updated FPU status word value.
4471 * @param fSignIn Incoming sign indicator.
4472 * @param uMantissaIn Incoming mantissa (dot between bit 63 and 62).
4473 * @param iExponentIn Unbiased exponent.
4474 * @param fFcw The FPU control word.
4475 * @param fFsw Prepped FPU status word, i.e. exceptions and C1 clear.
4476 * @param pr64Dst Where to return the output value, if one should be
4477 * returned.
4478 *
4479 * @note Tailored as a helper for iemAImpl_fst_r80_to_r64 right now.
4480 * @note Exact same logic as iemAImpl_StoreNormalR80AsR32.
4481 */
4482static uint16_t iemAImpl_StoreNormalR80AsR64(bool fSignIn, uint64_t uMantissaIn, int32_t iExponentIn,
4483 uint16_t fFcw, uint16_t fFsw, PRTFLOAT64U pr64Dst)
4484{
4485 uint64_t const fRoundingOffMask = RT_BIT_64(RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS) - 1; /* 0x7ff */
4486 uint32_t const uRoundingAdd = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
4487 ? RT_BIT_64(RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS - 1) /* 0x400 */
4488 : (fFcw & X86_FCW_RC_MASK) == (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)
4489 ? fRoundingOffMask
4490 : 0;
4491 uint32_t fRoundedOff = uMantissaIn & fRoundingOffMask;
4492
4493 /*
4494 * Deal with potential overflows/underflows first, optimizing for none.
4495 * 0 and MAX are used for special values; MAX-1 may be rounded up to MAX.
4496 */
4497 int32_t iExponentOut = (int32_t)iExponentIn + RTFLOAT64U_EXP_BIAS;
4498 if ((uint32_t)iExponentOut - 1 < (uint32_t)(RTFLOAT64U_EXP_MAX - 3))
4499 { /* likely? */ }
4500 /*
4501 * Underflow if the exponent zero or negative. This is attempted mapped
4502 * to a subnormal number when possible, with some additional trickery ofc.
4503 */
4504 else if (iExponentOut <= 0)
4505 {
4506 bool const fIsTiny = iExponentOut < 0
4507 || UINT64_MAX - uMantissaIn > uRoundingAdd;
4508 if (!(fFcw & X86_FCW_UM) && fIsTiny)
4509 /* Note! 754-1985 sec 7.4 has something about bias adjust of 1536 here, not in 2008 & 2019. Perhaps only 8087 & 287? */
4510 return fFsw | X86_FSW_UE | X86_FSW_ES | X86_FSW_B;
4511
4512 if (iExponentOut <= 0)
4513 {
4514 uMantissaIn = iExponentOut <= -63
4515 ? uMantissaIn != 0
4516 : (uMantissaIn >> (-iExponentOut + 1)) | ((uMantissaIn & (RT_BIT_64(-iExponentOut + 1) - 1)) != 0);
4517 fRoundedOff = uMantissaIn & fRoundingOffMask;
4518 if (fRoundedOff && fIsTiny)
4519 fFsw |= X86_FSW_UE;
4520 iExponentOut = 0;
4521 }
4522 }
4523 /*
4524 * Overflow if at or above max exponent value or if we will reach max
4525 * when rounding. Will return +/-zero or +/-max value depending on
4526 * whether we're rounding or not.
4527 */
4528 else if ( iExponentOut >= RTFLOAT64U_EXP_MAX
4529 || ( iExponentOut == RTFLOAT64U_EXP_MAX - 1
4530 && UINT64_MAX - uMantissaIn <= uRoundingAdd))
4531 {
4532 fFsw |= X86_FSW_OE;
4533 if (!(fFcw & X86_FCW_OM))
4534 return fFsw | X86_FSW_ES | X86_FSW_B;
4535 fFsw |= X86_FSW_PE;
4536 if (uRoundingAdd)
4537 fFsw |= X86_FSW_C1;
4538 if (!(fFcw & X86_FCW_PM))
4539 fFsw |= X86_FSW_ES | X86_FSW_B;
4540
4541 pr64Dst->s64.fSign = fSignIn;
4542 if (uRoundingAdd)
4543 { /* Zero */
4544 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX;
4545 pr64Dst->s64.uFraction = 0;
4546 }
4547 else
4548 { /* Max */
4549 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX - 1;
4550 pr64Dst->s64.uFraction = RT_BIT_64(RTFLOAT64U_FRACTION_BITS) - 1;
4551 }
4552 return fFsw;
4553 }
4554
4555 /*
4556 * Normal or subnormal number.
4557 */
4558 /* Do rounding - just truncate in near mode when midway on an even outcome. */
4559 uint64_t uMantissaOut = uMantissaIn;
4560 if ( (fFcw & X86_FCW_RC_MASK) != X86_FCW_RC_NEAREST
4561 || (uMantissaIn & RT_BIT_32(RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS))
4562 || fRoundedOff != uRoundingAdd)
4563 {
4564 uMantissaOut = uMantissaIn + uRoundingAdd;
4565 if (uMantissaOut >= uMantissaIn)
4566 { /* likely */ }
4567 else
4568 {
4569 uMantissaOut >>= 1; /* (We don't need to add bit 63 here (the integer bit), as it will be chopped off below.) */
4570 iExponentOut++;
4571 Assert(iExponentOut < RTFLOAT64U_EXP_MAX); /* checked above */
4572 fFsw |= X86_FSW_C1;
4573 }
4574 }
4575 else
4576 uMantissaOut = uMantissaIn;
4577
4578 /* Truncate the mantissa and set the return value. */
4579 uMantissaOut >>= RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS;
4580
4581 pr64Dst->s64.uFraction = uMantissaOut; /* Note! too big for bitfield if normal. */
4582 pr64Dst->s64.uExponent = iExponentOut;
4583 pr64Dst->s64.fSign = fSignIn;
4584
4585 /* Set status flags realted to rounding. */
4586 if (fRoundedOff)
4587 {
4588 fFsw |= X86_FSW_PE;
4589 if (uMantissaOut > (uMantissaIn >> (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS)))
4590 fFsw |= X86_FSW_C1;
4591 if (!(fFcw & X86_FCW_PM))
4592 fFsw |= X86_FSW_ES | X86_FSW_B;
4593 }
4594
4595 return fFsw;
4596}
4597
4598
4599/**
4600 * @note Exact same logic as iemAImpl_fst_r80_to_r32.
4601 */
4602IEM_DECL_IMPL_DEF(void, iemAImpl_fst_r80_to_r64,(PCX86FXSTATE pFpuState, uint16_t *pu16FSW,
4603 PRTFLOAT64U pr64Dst, PCRTFLOAT80U pr80Src))
4604{
4605 uint16_t const fFcw = pFpuState->FCW;
4606 uint16_t fFsw = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3));
4607 if (RTFLOAT80U_IS_NORMAL(pr80Src))
4608 fFsw = iemAImpl_StoreNormalR80AsR64(pr80Src->s.fSign, pr80Src->s.uMantissa,
4609 (int32_t)pr80Src->s.uExponent - RTFLOAT80U_EXP_BIAS, fFcw, fFsw, pr64Dst);
4610 else if (RTFLOAT80U_IS_ZERO(pr80Src))
4611 {
4612 pr64Dst->s64.fSign = pr80Src->s.fSign;
4613 pr64Dst->s64.uExponent = 0;
4614 pr64Dst->s64.uFraction = 0;
4615 Assert(RTFLOAT64U_IS_ZERO(pr64Dst));
4616 }
4617 else if (RTFLOAT80U_IS_INF(pr80Src))
4618 {
4619 pr64Dst->s64.fSign = pr80Src->s.fSign;
4620 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX;
4621 pr64Dst->s64.uFraction = 0;
4622 Assert(RTFLOAT64U_IS_INF(pr64Dst));
4623 }
4624 else if (RTFLOAT80U_IS_INDEFINITE(pr80Src))
4625 {
4626 /* Mapped to +/-QNaN */
4627 pr64Dst->s64.fSign = pr80Src->s.fSign;
4628 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX;
4629 pr64Dst->s64.uFraction = RT_BIT_64(RTFLOAT64U_FRACTION_BITS - 1);
4630 }
4631 else if (RTFLOAT80U_IS_PSEUDO_INF(pr80Src) || RTFLOAT80U_IS_UNNORMAL(pr80Src) || RTFLOAT80U_IS_PSEUDO_NAN(pr80Src))
4632 {
4633 /* Pseudo-Inf / Pseudo-Nan / Unnormal -> QNaN (during load, probably) */
4634 if (fFcw & X86_FCW_IM)
4635 {
4636 pr64Dst->s64.fSign = 1;
4637 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX;
4638 pr64Dst->s64.uFraction = RT_BIT_64(RTFLOAT64U_FRACTION_BITS - 1);
4639 fFsw |= X86_FSW_IE;
4640 }
4641 else
4642 fFsw |= X86_FSW_IE | X86_FSW_ES | X86_FSW_B;;
4643 }
4644 else if (RTFLOAT80U_IS_NAN(pr80Src))
4645 {
4646 /* IM applies to signalled NaN input only. Everything is converted to quiet NaN. */
4647 if ((fFcw & X86_FCW_IM) || !RTFLOAT80U_IS_SIGNALLING_NAN(pr80Src))
4648 {
4649 pr64Dst->s64.fSign = pr80Src->s.fSign;
4650 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX;
4651 pr64Dst->s64.uFraction = pr80Src->sj64.uFraction >> (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS);
4652 pr64Dst->s64.uFraction |= RT_BIT_64(RTFLOAT64U_FRACTION_BITS - 1);
4653 if (RTFLOAT80U_IS_SIGNALLING_NAN(pr80Src))
4654 fFsw |= X86_FSW_IE;
4655 }
4656 else
4657 fFsw |= X86_FSW_IE | X86_FSW_ES | X86_FSW_B;
4658 }
4659 else
4660 {
4661 /* Denormal values causes both an underflow and precision exception. */
4662 Assert(RTFLOAT80U_IS_DENORMAL(pr80Src) || RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Src));
4663 if (fFcw & X86_FCW_UM)
4664 {
4665 pr64Dst->s64.fSign = pr80Src->s.fSign;
4666 pr64Dst->s64.uExponent = 0;
4667 if ((fFcw & X86_FCW_RC_MASK) == (!pr80Src->s.fSign ? X86_FCW_RC_UP : X86_FCW_RC_DOWN))
4668 {
4669 pr64Dst->s64.uFraction = 1;
4670 fFsw |= X86_FSW_UE | X86_FSW_PE | X86_FSW_C1;
4671 if (!(fFcw & X86_FCW_PM))
4672 fFsw |= X86_FSW_ES | X86_FSW_B;
4673 }
4674 else
4675 {
4676 pr64Dst->s64.uFraction = 0;
4677 fFsw |= X86_FSW_UE | X86_FSW_PE;
4678 if (!(fFcw & X86_FCW_PM))
4679 fFsw |= X86_FSW_ES | X86_FSW_B;
4680 }
4681 }
4682 else
4683 fFsw |= X86_FSW_UE | X86_FSW_ES | X86_FSW_B;
4684 }
4685 *pu16FSW = fFsw;
4686}
4687
4688
4689IEM_DECL_IMPL_DEF(void, iemAImpl_fst_r80_to_r80,(PCX86FXSTATE pFpuState, uint16_t *pu16FSW,
4690 PRTFLOAT80U pr80Dst, PCRTFLOAT80U pr80Src))
4691{
4692 /*
4693 * FPU status word:
4694 * - TOP is irrelevant, but we must match x86 assembly version (0).
4695 * - C1 is always cleared as we don't have any stack overflows.
4696 * - C0, C2, and C3 are undefined and Intel 10980XE does not touch them.
4697 */
4698 *pu16FSW = pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3); /* see iemAImpl_fld1 */
4699 *pr80Dst = *pr80Src;
4700}
4701
4702
4703/*
4704 *
4705 * Mantissa:
4706 * 63 56 48 40 32 24 16 8 0
4707 * v v v v v v v v v
4708 * 1[.]111 0000 1111 0000 1111 0000 1111 0000 1111 0000 1111 0000 1111 0000 1111 0000
4709 * \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \
4710 * Exp: 0 4 8 12 16 20 24 28 32 36 40 44 48 52 56 60
4711 *
4712 * int64_t has the same width, only bit 63 is the sign bit. So, the max we can map over
4713 * are bits 1 thru 63, dropping off bit 0, with an exponent of 62. The number of bits we
4714 * drop off from the mantissa increases with decreasing exponent, till an exponent of 0
4715 * where we'll drop off all but bit 63.
4716 */
4717#define EMIT_FIST(a_cBits, a_iType, a_iTypeMin, a_iTypeIndefinite) \
4718IEM_DECL_IMPL_DEF(void, iemAImpl_fist_r80_to_i ## a_cBits,(PCX86FXSTATE pFpuState, uint16_t *pu16FSW, \
4719 a_iType *piDst, PCRTFLOAT80U pr80Val)) \
4720{ \
4721 uint16_t const fFcw = pFpuState->FCW; \
4722 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); \
4723 bool const fSignIn = pr80Val->s.fSign; \
4724 \
4725 /* \
4726 * Deal with normal numbers first. \
4727 */ \
4728 if (RTFLOAT80U_IS_NORMAL(pr80Val)) \
4729 { \
4730 uint64_t uMantissa = pr80Val->s.uMantissa; \
4731 int32_t iExponent = (int32_t)pr80Val->s.uExponent - RTFLOAT80U_EXP_BIAS; \
4732 \
4733 if ((uint32_t)iExponent <= a_cBits - 2) \
4734 { \
4735 unsigned const cShiftOff = 63 - iExponent; \
4736 uint64_t const fRoundingOffMask = RT_BIT_64(cShiftOff) - 1; \
4737 uint64_t const uRoundingAdd = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST \
4738 ? RT_BIT_64(cShiftOff - 1) \
4739 : (fFcw & X86_FCW_RC_MASK) == (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP) \
4740 ? fRoundingOffMask \
4741 : 0; \
4742 uint64_t fRoundedOff = uMantissa & fRoundingOffMask; \
4743 \
4744 uMantissa >>= cShiftOff; \
4745 uint64_t const uRounding = (fRoundedOff + uRoundingAdd) >> cShiftOff; \
4746 uMantissa += uRounding; \
4747 if (!(uMantissa & RT_BIT_64(a_cBits - 1))) \
4748 { \
4749 if (fRoundedOff) \
4750 { \
4751 if ((uMantissa & 1) && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST && fRoundedOff == uRoundingAdd) \
4752 uMantissa &= ~(uint64_t)1; /* round to even number if equal distance between up/down. */ \
4753 else if (uRounding) \
4754 fFsw |= X86_FSW_C1; \
4755 fFsw |= X86_FSW_PE; \
4756 if (!(fFcw & X86_FCW_PM)) \
4757 fFsw |= X86_FSW_ES | X86_FSW_B; \
4758 } \
4759 \
4760 if (!fSignIn) \
4761 *piDst = (a_iType)uMantissa; \
4762 else \
4763 *piDst = -(a_iType)uMantissa; \
4764 } \
4765 else \
4766 { \
4767 /* overflowed after rounding. */ \
4768 AssertMsg(iExponent == a_cBits - 2 && uMantissa == RT_BIT_64(a_cBits - 1), \
4769 ("e=%d m=%#RX64 (org %#RX64) s=%d; shift=%d ro=%#RX64 rm=%#RX64 ra=%#RX64\n", iExponent, uMantissa, \
4770 pr80Val->s.uMantissa, fSignIn, cShiftOff, fRoundedOff, fRoundingOffMask, uRoundingAdd)); \
4771 \
4772 /* Special case for the integer minimum value. */ \
4773 if (fSignIn) \
4774 { \
4775 *piDst = a_iTypeMin; \
4776 fFsw |= X86_FSW_PE | X86_FSW_C1; \
4777 if (!(fFcw & X86_FCW_PM)) \
4778 fFsw |= X86_FSW_ES | X86_FSW_B; \
4779 } \
4780 else \
4781 { \
4782 fFsw |= X86_FSW_IE; \
4783 if (fFcw & X86_FCW_IM) \
4784 *piDst = a_iTypeMin; \
4785 else \
4786 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT); \
4787 } \
4788 } \
4789 } \
4790 /* \
4791 * Tiny sub-zero numbers. \
4792 */ \
4793 else if (iExponent < 0) \
4794 { \
4795 if (!fSignIn) \
4796 { \
4797 if ( (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_UP \
4798 || (iExponent == -1 && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST)) \
4799 { \
4800 *piDst = 1; \
4801 fFsw |= X86_FSW_C1; \
4802 } \
4803 else \
4804 *piDst = 0; \
4805 } \
4806 else \
4807 { \
4808 if ( (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_UP \
4809 || (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_ZERO \
4810 || (iExponent < -1 && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST)) \
4811 *piDst = 0; \
4812 else \
4813 { \
4814 *piDst = -1; \
4815 fFsw |= X86_FSW_C1; \
4816 } \
4817 } \
4818 fFsw |= X86_FSW_PE; \
4819 if (!(fFcw & X86_FCW_PM)) \
4820 fFsw |= X86_FSW_ES | X86_FSW_B; \
4821 } \
4822 /* \
4823 * Special MIN case. \
4824 */ \
4825 else if ( fSignIn && iExponent == a_cBits - 1 \
4826 && ( a_cBits < 64 && (fFcw & X86_FCW_RC_MASK) != X86_FCW_RC_DOWN \
4827 ? uMantissa < (RT_BIT_64(63) | RT_BIT_64(65 - a_cBits)) \
4828 : uMantissa == RT_BIT_64(63))) \
4829 { \
4830 *piDst = a_iTypeMin; \
4831 if (uMantissa & (RT_BIT_64(64 - a_cBits + 1) - 1)) \
4832 { \
4833 fFsw |= X86_FSW_PE; \
4834 if (!(fFcw & X86_FCW_PM)) \
4835 fFsw |= X86_FSW_ES | X86_FSW_B; \
4836 } \
4837 } \
4838 /* \
4839 * Too large/small number outside the target integer range. \
4840 */ \
4841 else \
4842 { \
4843 fFsw |= X86_FSW_IE; \
4844 if (fFcw & X86_FCW_IM) \
4845 *piDst = a_iTypeIndefinite; \
4846 else \
4847 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT); \
4848 } \
4849 } \
4850 /* \
4851 * Map both +0 and -0 to integer zero (signless/+). \
4852 */ \
4853 else if (RTFLOAT80U_IS_ZERO(pr80Val)) \
4854 *piDst = 0; \
4855 /* \
4856 * Denormals are just really tiny sub-zero numbers that are either rounded \
4857 * to zero, 1 or -1 depending on sign and rounding control. \
4858 */ \
4859 else if (RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val) || RTFLOAT80U_IS_DENORMAL(pr80Val)) \
4860 { \
4861 if ((fFcw & X86_FCW_RC_MASK) != (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)) \
4862 *piDst = 0; \
4863 else \
4864 { \
4865 *piDst = fSignIn ? -1 : 1; \
4866 fFsw |= X86_FSW_C1; \
4867 } \
4868 fFsw |= X86_FSW_PE; \
4869 if (!(fFcw & X86_FCW_PM)) \
4870 fFsw |= X86_FSW_ES | X86_FSW_B; \
4871 } \
4872 /* \
4873 * All other special values are considered invalid arguments and result \
4874 * in an IE exception and indefinite value if masked. \
4875 */ \
4876 else \
4877 { \
4878 fFsw |= X86_FSW_IE; \
4879 if (fFcw & X86_FCW_IM) \
4880 *piDst = a_iTypeIndefinite; \
4881 else \
4882 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT); \
4883 } \
4884 *pu16FSW = fFsw; \
4885}
4886EMIT_FIST(64, int64_t, INT64_MIN, X86_FPU_INT64_INDEFINITE)
4887EMIT_FIST(32, int32_t, INT32_MIN, X86_FPU_INT32_INDEFINITE)
4888EMIT_FIST(16, int16_t, INT16_MIN, X86_FPU_INT16_INDEFINITE)
4889
4890#endif /*IEM_WITHOUT_ASSEMBLY */
4891
4892
4893/*
4894 * The FISTT instruction was added with SSE3 and are a lot simpler than FIST.
4895 *
4896 * The 16-bit version is a bit peculiar, though, as it seems to be raising IE
4897 * as if it was the 32-bit version (i.e. starting with exp 31 instead of 15),
4898 * thus the @a a_cBitsIn.
4899 */
4900#define EMIT_FISTT(a_cBits, a_cBitsIn, a_iType, a_iTypeMin, a_iTypeMax, a_iTypeIndefinite, a_Suffix, a_fIntelVersion) \
4901IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_fistt_r80_to_i,a_cBits,a_Suffix),(PCX86FXSTATE pFpuState, uint16_t *pu16FSW, \
4902 a_iType *piDst, PCRTFLOAT80U pr80Val)) \
4903{ \
4904 uint16_t const fFcw = pFpuState->FCW; \
4905 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); \
4906 bool const fSignIn = pr80Val->s.fSign; \
4907 \
4908 /* \
4909 * Deal with normal numbers first. \
4910 */ \
4911 if (RTFLOAT80U_IS_NORMAL(pr80Val)) \
4912 { \
4913 uint64_t uMantissa = pr80Val->s.uMantissa; \
4914 int32_t iExponent = (int32_t)pr80Val->s.uExponent - RTFLOAT80U_EXP_BIAS; \
4915 \
4916 if ((uint32_t)iExponent <= a_cBitsIn - 2) \
4917 { \
4918 unsigned const cShiftOff = 63 - iExponent; \
4919 uint64_t const fRoundingOffMask = RT_BIT_64(cShiftOff) - 1; \
4920 uint64_t const fRoundedOff = uMantissa & fRoundingOffMask; \
4921 uMantissa >>= cShiftOff; \
4922 /*Assert(!(uMantissa & RT_BIT_64(a_cBits - 1)));*/ \
4923 if (!fSignIn) \
4924 *piDst = (a_iType)uMantissa; \
4925 else \
4926 *piDst = -(a_iType)uMantissa; \
4927 \
4928 if (fRoundedOff) \
4929 { \
4930 fFsw |= X86_FSW_PE; \
4931 if (!(fFcw & X86_FCW_PM)) \
4932 fFsw |= X86_FSW_ES | X86_FSW_B; \
4933 } \
4934 } \
4935 /* \
4936 * Tiny sub-zero numbers. \
4937 */ \
4938 else if (iExponent < 0) \
4939 { \
4940 *piDst = 0; \
4941 fFsw |= X86_FSW_PE; \
4942 if (!(fFcw & X86_FCW_PM)) \
4943 fFsw |= X86_FSW_ES | X86_FSW_B; \
4944 } \
4945 /* \
4946 * Special MIN case. \
4947 */ \
4948 else if ( fSignIn && iExponent == a_cBits - 1 \
4949 && (a_cBits < 64 \
4950 ? uMantissa < (RT_BIT_64(63) | RT_BIT_64(65 - a_cBits)) \
4951 : uMantissa == RT_BIT_64(63)) ) \
4952 { \
4953 *piDst = a_iTypeMin; \
4954 if (uMantissa & (RT_BIT_64(64 - a_cBits + 1) - 1)) \
4955 { \
4956 fFsw |= X86_FSW_PE; \
4957 if (!(fFcw & X86_FCW_PM)) \
4958 fFsw |= X86_FSW_ES | X86_FSW_B; \
4959 } \
4960 } \
4961 /* \
4962 * Figure this weirdness. \
4963 */ \
4964 else if (0 /* huh? gone? */ && a_cBits == 16 && fSignIn && iExponent == 31 && uMantissa < UINT64_C(0x8000100000000000) ) \
4965 { \
4966 *piDst = 0; \
4967 if (uMantissa & (RT_BIT_64(64 - a_cBits + 1) - 1)) \
4968 { \
4969 fFsw |= X86_FSW_PE; \
4970 if (!(fFcw & X86_FCW_PM)) \
4971 fFsw |= X86_FSW_ES | X86_FSW_B; \
4972 } \
4973 } \
4974 /* \
4975 * Too large/small number outside the target integer range. \
4976 */ \
4977 else \
4978 { \
4979 fFsw |= X86_FSW_IE; \
4980 if (fFcw & X86_FCW_IM) \
4981 *piDst = a_iTypeIndefinite; \
4982 else \
4983 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT); \
4984 } \
4985 } \
4986 /* \
4987 * Map both +0 and -0 to integer zero (signless/+). \
4988 */ \
4989 else if (RTFLOAT80U_IS_ZERO(pr80Val)) \
4990 *piDst = 0; \
4991 /* \
4992 * Denormals are just really tiny sub-zero numbers that are trucated to zero. \
4993 */ \
4994 else if (RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val) || RTFLOAT80U_IS_DENORMAL(pr80Val)) \
4995 { \
4996 *piDst = 0; \
4997 fFsw |= X86_FSW_PE; \
4998 if (!(fFcw & X86_FCW_PM)) \
4999 fFsw |= X86_FSW_ES | X86_FSW_B; \
5000 } \
5001 /* \
5002 * All other special values are considered invalid arguments and result \
5003 * in an IE exception and indefinite value if masked. \
5004 */ \
5005 else \
5006 { \
5007 fFsw |= X86_FSW_IE; \
5008 if (fFcw & X86_FCW_IM) \
5009 *piDst = a_iTypeIndefinite; \
5010 else \
5011 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT); \
5012 } \
5013 *pu16FSW = fFsw; \
5014}
5015#if defined(IEM_WITHOUT_ASSEMBLY)
5016EMIT_FISTT(64, 64, int64_t, INT64_MIN, INT64_MAX, X86_FPU_INT64_INDEFINITE, RT_NOTHING, 1)
5017EMIT_FISTT(32, 32, int32_t, INT32_MIN, INT32_MAX, X86_FPU_INT32_INDEFINITE, RT_NOTHING, 1)
5018EMIT_FISTT(16, 16, int16_t, INT16_MIN, INT16_MAX, X86_FPU_INT16_INDEFINITE, RT_NOTHING, 1)
5019#endif
5020EMIT_FISTT(16, 16, int16_t, INT16_MIN, INT16_MAX, X86_FPU_INT16_INDEFINITE, _intel, 1)
5021EMIT_FISTT(16, 16, int16_t, INT16_MIN, INT16_MAX, X86_FPU_INT16_INDEFINITE, _amd, 0)
5022
5023
5024#if defined(IEM_WITHOUT_ASSEMBLY)
5025
5026IEM_DECL_IMPL_DEF(void, iemAImpl_fst_r80_to_d80,(PCX86FXSTATE pFpuState, uint16_t *pu16FSW,
5027 PRTPBCD80U pd80Dst, PCRTFLOAT80U pr80Src))
5028{
5029 /*static RTPBCD80U const s_ad80MaxMin[2] = { RTPBCD80U_INIT_MAX(), RTPBCD80U_INIT_MIN() };*/
5030 static RTPBCD80U const s_ad80Zeros[2] = { RTPBCD80U_INIT_ZERO(0), RTPBCD80U_INIT_ZERO(1) };
5031 static RTPBCD80U const s_ad80One[2] = { RTPBCD80U_INIT_C(0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,1),
5032 RTPBCD80U_INIT_C(1, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,1) };
5033 static RTPBCD80U const s_d80Indefinite = RTPBCD80U_INIT_INDEFINITE();
5034
5035 uint16_t const fFcw = pFpuState->FCW;
5036 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3));
5037 bool const fSignIn = pr80Src->s.fSign;
5038
5039 /*
5040 * Deal with normal numbers first.
5041 */
5042 if (RTFLOAT80U_IS_NORMAL(pr80Src))
5043 {
5044 uint64_t uMantissa = pr80Src->s.uMantissa;
5045 int32_t iExponent = (int32_t)pr80Src->s.uExponent - RTFLOAT80U_EXP_BIAS;
5046 if ( (uint32_t)iExponent <= 58
5047 || ((uint32_t)iExponent == 59 && uMantissa <= UINT64_C(0xde0b6b3a763fffff)) )
5048 {
5049 unsigned const cShiftOff = 63 - iExponent;
5050 uint64_t const fRoundingOffMask = RT_BIT_64(cShiftOff) - 1;
5051 uint64_t const uRoundingAdd = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
5052 ? RT_BIT_64(cShiftOff - 1)
5053 : (fFcw & X86_FCW_RC_MASK) == (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)
5054 ? fRoundingOffMask
5055 : 0;
5056 uint64_t fRoundedOff = uMantissa & fRoundingOffMask;
5057
5058 uMantissa >>= cShiftOff;
5059 uint64_t const uRounding = (fRoundedOff + uRoundingAdd) >> cShiftOff;
5060 uMantissa += uRounding;
5061 if (uMantissa <= (uint64_t)RTPBCD80U_MAX)
5062 {
5063 if (fRoundedOff)
5064 {
5065 if ((uMantissa & 1) && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST && fRoundedOff == uRoundingAdd)
5066 uMantissa &= ~(uint64_t)1; /* round to even number if equal distance between up/down. */
5067 else if (uRounding)
5068 fFsw |= X86_FSW_C1;
5069 fFsw |= X86_FSW_PE;
5070 if (!(fFcw & X86_FCW_PM))
5071 fFsw |= X86_FSW_ES | X86_FSW_B;
5072 }
5073
5074 pd80Dst->s.fSign = fSignIn;
5075 pd80Dst->s.uPad = 0;
5076 for (size_t iPair = 0; iPair < RT_ELEMENTS(pd80Dst->s.abPairs); iPair++)
5077 {
5078 unsigned const uDigits = uMantissa % 100;
5079 uMantissa /= 100;
5080 uint8_t const bLo = uDigits % 10;
5081 uint8_t const bHi = uDigits / 10;
5082 pd80Dst->s.abPairs[iPair] = RTPBCD80U_MAKE_PAIR(bHi, bLo);
5083 }
5084 }
5085 else
5086 {
5087 /* overflowed after rounding. */
5088 fFsw |= X86_FSW_IE;
5089 if (fFcw & X86_FCW_IM)
5090 *pd80Dst = s_d80Indefinite;
5091 else
5092 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT);
5093 }
5094 }
5095 /*
5096 * Tiny sub-zero numbers.
5097 */
5098 else if (iExponent < 0)
5099 {
5100 if (!fSignIn)
5101 {
5102 if ( (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_UP
5103 || (iExponent == -1 && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST))
5104 {
5105 *pd80Dst = s_ad80One[fSignIn];
5106 fFsw |= X86_FSW_C1;
5107 }
5108 else
5109 *pd80Dst = s_ad80Zeros[fSignIn];
5110 }
5111 else
5112 {
5113 if ( (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_UP
5114 || (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_ZERO
5115 || (iExponent < -1 && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST))
5116 *pd80Dst = s_ad80Zeros[fSignIn];
5117 else
5118 {
5119 *pd80Dst = s_ad80One[fSignIn];
5120 fFsw |= X86_FSW_C1;
5121 }
5122 }
5123 fFsw |= X86_FSW_PE;
5124 if (!(fFcw & X86_FCW_PM))
5125 fFsw |= X86_FSW_ES | X86_FSW_B;
5126 }
5127 /*
5128 * Too large/small number outside the target integer range.
5129 */
5130 else
5131 {
5132 fFsw |= X86_FSW_IE;
5133 if (fFcw & X86_FCW_IM)
5134 *pd80Dst = s_d80Indefinite;
5135 else
5136 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT);
5137 }
5138 }
5139 /*
5140 * Map both +0 and -0 to integer zero (signless/+).
5141 */
5142 else if (RTFLOAT80U_IS_ZERO(pr80Src))
5143 *pd80Dst = s_ad80Zeros[fSignIn];
5144 /*
5145 * Denormals are just really tiny sub-zero numbers that are either rounded
5146 * to zero, 1 or -1 depending on sign and rounding control.
5147 */
5148 else if (RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Src) || RTFLOAT80U_IS_DENORMAL(pr80Src))
5149 {
5150 if ((fFcw & X86_FCW_RC_MASK) != (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP))
5151 *pd80Dst = s_ad80Zeros[fSignIn];
5152 else
5153 {
5154 *pd80Dst = s_ad80One[fSignIn];
5155 fFsw |= X86_FSW_C1;
5156 }
5157 fFsw |= X86_FSW_PE;
5158 if (!(fFcw & X86_FCW_PM))
5159 fFsw |= X86_FSW_ES | X86_FSW_B;
5160 }
5161 /*
5162 * All other special values are considered invalid arguments and result
5163 * in an IE exception and indefinite value if masked.
5164 */
5165 else
5166 {
5167 fFsw |= X86_FSW_IE;
5168 if (fFcw & X86_FCW_IM)
5169 *pd80Dst = s_d80Indefinite;
5170 else
5171 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT);
5172 }
5173 *pu16FSW = fFsw;
5174}
5175
5176
5177/*********************************************************************************************************************************
5178* FPU Helpers *
5179*********************************************************************************************************************************/
5180AssertCompileSize(RTFLOAT128U, 16);
5181AssertCompileSize(RTFLOAT80U, 10);
5182AssertCompileSize(RTFLOAT64U, 8);
5183AssertCompileSize(RTFLOAT32U, 4);
5184
/**
 * Normalizes a possible pseudo-denormal value.
 *
 * Pseudo-denormal values are some oddities from the 8087 & 287 days.  They are
 * denormals with the J-bit set, so they can simply be rewritten as 2**-16382,
 * i.e. changing uExponent from 0 to 1.
 *
 * This macro always declares a RTFLOAT80U with the name given by
 * @a a_r80ValNormalized.  Only when @a a_pr80Val points to a pseudo-denormal is
 * that variable filled with a normalized copy and @a a_pr80Val redirected to
 * point to it.
 *
 * @note    This must be applied before calling SoftFloat with a value that
 *          could be a pseudo-denormal, as SoftFloat doesn't handle
 *          pseudo-denormals correctly.
 */
#define IEM_NORMALIZE_PSEUDO_DENORMAL(a_pr80Val, a_r80ValNormalized) \
    RTFLOAT80U a_r80ValNormalized; \
    if (!RTFLOAT80U_IS_PSEUDO_DENORMAL(a_pr80Val)) \
    { /* likely - nothing to normalize */ } \
    else \
    { \
        a_r80ValNormalized = *a_pr80Val; \
        a_r80ValNormalized.s.uExponent = 1; \
        a_pr80Val = &a_r80ValNormalized; \
    } do {} while (0)
5208
5209#ifdef IEM_WITH_FLOAT128_FOR_FPU
5210
5211DECLINLINE(int) iemFpuF128SetRounding(uint16_t fFcw)
5212{
5213 int fNew;
5214 switch (fFcw & X86_FCW_RC_MASK)
5215 {
5216 default:
5217 case X86_FCW_RC_NEAREST: fNew = FE_TONEAREST; break;
5218 case X86_FCW_RC_ZERO: fNew = FE_TOWARDZERO; break;
5219 case X86_FCW_RC_UP: fNew = FE_UPWARD; break;
5220 case X86_FCW_RC_DOWN: fNew = FE_DOWNWARD; break;
5221 }
5222 int fOld = fegetround();
5223 fesetround(fNew);
5224 return fOld;
5225}
5226
5227
5228DECLINLINE(void) iemFpuF128RestoreRounding(int fOld)
5229{
5230 fesetround(fOld);
5231}
5232
5233DECLINLINE(_Float128) iemFpuF128FromFloat80(PCRTFLOAT80U pr80Val, uint16_t fFcw)
5234{
5235 RT_NOREF(fFcw);
5236 RTFLOAT128U Tmp;
5237 Tmp.s2.uSignAndExponent = pr80Val->s2.uSignAndExponent;
5238 Tmp.s2.uFractionHigh = (uint16_t)((pr80Val->s2.uMantissa & (RT_BIT_64(63) - 1)) >> 48);
5239 Tmp.s2.uFractionMid = (uint32_t)((pr80Val->s2.uMantissa & UINT32_MAX) >> 16);
5240 Tmp.s2.uFractionLow = pr80Val->s2.uMantissa << 48;
5241 if (RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val))
5242 {
5243 Assert(Tmp.s.uExponent == 0);
5244 Tmp.s2.uSignAndExponent++;
5245 }
5246 return *(_Float128 *)&Tmp;
5247}
5248
5249
5250DECLINLINE(uint16_t) iemFpuF128ToFloat80(PRTFLOAT80U pr80Dst, _Float128 rd128ValSrc, uint16_t fFcw, uint16_t fFsw)
5251{
5252 RT_NOREF(fFcw);
5253 RTFLOAT128U Tmp;
5254 *(_Float128 *)&Tmp = rd128ValSrc;
5255 ASMCompilerBarrier();
5256 if (RTFLOAT128U_IS_NORMAL(&Tmp))
5257 {
5258 pr80Dst->s.fSign = Tmp.s64.fSign;
5259 pr80Dst->s.uExponent = Tmp.s64.uExponent;
5260 uint64_t uFraction = Tmp.s64.uFractionHi << (63 - 48)
5261 | Tmp.s64.uFractionLo >> (64 - 15);
5262
5263 /* Do rounding - just truncate in near mode when midway on an even outcome. */
5264 unsigned const cShiftOff = 64 - 15;
5265 uint64_t const fRoundingOffMask = RT_BIT_64(cShiftOff) - 1;
5266 uint64_t const uRoundedOff = Tmp.s64.uFractionLo & fRoundingOffMask;
5267 if (uRoundedOff)
5268 {
5269 uint64_t const uRoundingAdd = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
5270 ? RT_BIT_64(cShiftOff - 1)
5271 : (fFcw & X86_FCW_RC_MASK) == (Tmp.s64.fSign ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)
5272 ? fRoundingOffMask
5273 : 0;
5274 if ( (fFcw & X86_FCW_RC_MASK) != X86_FCW_RC_NEAREST
5275 || (Tmp.s64.uFractionLo & RT_BIT_64(cShiftOff))
5276 || uRoundedOff != uRoundingAdd)
5277 {
5278 if ((uRoundedOff + uRoundingAdd) >> cShiftOff)
5279 {
5280 uFraction += 1;
5281 if (!(uFraction & RT_BIT_64(63)))
5282 { /* likely */ }
5283 else
5284 {
5285 uFraction >>= 1;
5286 pr80Dst->s.uExponent++;
5287 if (pr80Dst->s.uExponent == RTFLOAT64U_EXP_MAX)
5288 return fFsw;
5289 }
5290 fFsw |= X86_FSW_C1;
5291 }
5292 }
5293 fFsw |= X86_FSW_PE;
5294 if (!(fFcw & X86_FCW_PM))
5295 fFsw |= X86_FSW_ES | X86_FSW_B;
5296 }
5297 pr80Dst->s.uMantissa = RT_BIT_64(63) | uFraction;
5298 }
5299 else if (RTFLOAT128U_IS_ZERO(&Tmp))
5300 {
5301 pr80Dst->s.fSign = Tmp.s64.fSign;
5302 pr80Dst->s.uExponent = 0;
5303 pr80Dst->s.uMantissa = 0;
5304 }
5305 else if (RTFLOAT128U_IS_INF(&Tmp))
5306 {
5307 pr80Dst->s.fSign = Tmp.s64.fSign;
5308 pr80Dst->s.uExponent = 0;
5309 pr80Dst->s.uMantissa = 0;
5310 }
5311 return fFsw;
5312}
5313
5314
5315#else /* !IEM_WITH_FLOAT128_FOR_FPU - SoftFloat */
5316
/**
 * Initializer for the SoftFloat state structure.
 *
 * Expands to a brace initializer with five positional values derived from the
 * FPU control word:
 *      -# softfloat_tininess_afterRounding,
 *      -# the SoftFloat rounding mode matching FCW.RC (near_even for nearest,
 *         max for up, min for down and minMag for the remaining setting),
 *      -# zero (presumably the initial exception flags - confirm against the
 *         softfloat_state_t layout),
 *      -# the FCW bits covered by X86_FCW_XCPT_MASK,
 *      -# 64 when FCW.PC is X86_FCW_PC_53, 32 when it is X86_FCW_PC_24 and 80
 *         otherwise (presumably the extF80 rounding precision - confirm).
 *
 * @param   a_fFcw  The FPU control word.  Evaluated multiple times.
 */
# define IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(a_fFcw) \
    { \
        softfloat_tininess_afterRounding, \
          ((a_fFcw) & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST ? (uint8_t)softfloat_round_near_even \
        : ((a_fFcw) & X86_FCW_RC_MASK) == X86_FCW_RC_UP      ? (uint8_t)softfloat_round_max \
        : ((a_fFcw) & X86_FCW_RC_MASK) == X86_FCW_RC_DOWN    ? (uint8_t)softfloat_round_min \
        :                                                      (uint8_t)softfloat_round_minMag, \
        0, \
        (uint8_t)((a_fFcw) & X86_FCW_XCPT_MASK), \
          ((a_fFcw) & X86_FCW_PC_MASK) == X86_FCW_PC_53 ? (uint8_t)64 \
        : ((a_fFcw) & X86_FCW_PC_MASK) == X86_FCW_PC_24 ? (uint8_t)32 : (uint8_t)80 \
    }
5330
/**
 * Returns updated FSW from a SoftFloat state and exception mask (FCW).
 *
 * The result is @a a_fFsw with the following ORed in:
 *      - the softfloat_flag_c1 bit of the SoftFloat exception flags shifted
 *        left by two,
 *      - the SoftFloat exception flags covered by X86_FSW_XCPT_MASK,
 *      - X86_FSW_ES and X86_FSW_B if any of those exception flags is not
 *        masked in @a a_fFcw.
 *
 * @param   a_fFsw          The FSW value to start with.
 * @param   a_pSoftState    Pointer to the SoftFloat state.  Evaluated
 *                          multiple times.
 * @param   a_fFcw          The FPU control word (exception masks).
 */
# define IEM_SOFTFLOAT_STATE_TO_FSW(a_fFsw, a_pSoftState, a_fFcw) \
    (  (a_fFsw) \
     | (uint16_t)(((a_pSoftState)->exceptionFlags & softfloat_flag_c1) << 2) \
     | ((a_pSoftState)->exceptionFlags & X86_FSW_XCPT_MASK) \
     | (  ((a_pSoftState)->exceptionFlags & X86_FSW_XCPT_MASK) & (~(a_fFcw) & X86_FSW_XCPT_MASK) \
        ? X86_FSW_ES | X86_FSW_B : 0) )
5338
5339
5340DECLINLINE(float128_t) iemFpuSoftF128Precision(float128_t r128, unsigned cBits, uint16_t fFcw = X86_FCW_RC_NEAREST)
5341{
5342 RT_NOREF(fFcw);
5343 Assert(cBits > 64);
5344# if 0 /* rounding does not seem to help */
5345 uint64_t off = r128.v[0] & (RT_BIT_64(1 + 112 - cBits) - 1);
5346 r128.v[0] &= ~(RT_BIT_64(1 + 112 - cBits) - 1);
5347 if (off >= RT_BIT_64(1 + 112 - cBits - 1)
5348 && (r128.v[0] & RT_BIT_64(1 + 112 - cBits)))
5349 {
5350 uint64_t uOld = r128.v[0];
5351 r128.v[0] += RT_BIT_64(1 + 112 - cBits);
5352 if (r128.v[0] < uOld)
5353 r128.v[1] += 1;
5354 }
5355# else
5356 r128.v[0] &= ~(RT_BIT_64(1 + 112 - cBits) - 1);
5357# endif
5358 return r128;
5359}
5360
5361
5362DECLINLINE(float128_t) iemFpuSoftF128PrecisionIprt(PCRTFLOAT128U pr128, unsigned cBits, uint16_t fFcw = X86_FCW_RC_NEAREST)
5363{
5364 RT_NOREF(fFcw);
5365 Assert(cBits > 64);
5366# if 0 /* rounding does not seem to help, not even on constants */
5367 float128_t r128 = { pr128->au64[0], pr128->au64[1] };
5368 uint64_t off = r128.v[0] & (RT_BIT_64(1 + 112 - cBits) - 1);
5369 r128.v[0] &= ~(RT_BIT_64(1 + 112 - cBits) - 1);
5370 if (off >= RT_BIT_64(1 + 112 - cBits - 1)
5371 && (r128.v[0] & RT_BIT_64(1 + 112 - cBits)))
5372 {
5373 uint64_t uOld = r128.v[0];
5374 r128.v[0] += RT_BIT_64(1 + 112 - cBits);
5375 if (r128.v[0] < uOld)
5376 r128.v[1] += 1;
5377 }
5378 return r128;
5379# else
5380 float128_t r128 = { { pr128->au64[0] & ~(RT_BIT_64(1 + 112 - cBits) - 1), pr128->au64[1] } };
5381 return r128;
5382# endif
5383}
5384
5385
# if 0 /* unused */
/**
 * Repackages an IPRT 128-bit floating point value as a SoftFloat one by
 * copying its two 64-bit words.  Currently compiled out.
 */
DECLINLINE(float128_t) iemFpuSoftF128FromIprt(PCRTFLOAT128U pr128)
{
    float128_t r128Ret;
    r128Ret.v[0] = pr128->au64[0];
    r128Ret.v[1] = pr128->au64[1];
    return r128Ret;
}
# endif
5393
5394
5395/** Converts a 80-bit floating point value to SoftFloat 128-bit floating point. */
5396DECLINLINE(float128_t) iemFpuSoftF128FromFloat80(PCRTFLOAT80U pr80Val)
5397{
5398 extFloat80_t Tmp;
5399 Tmp.signExp = pr80Val->s2.uSignAndExponent;
5400 Tmp.signif = pr80Val->s2.uMantissa;
5401 softfloat_state_t Ignored = SOFTFLOAT_STATE_INIT_DEFAULTS();
5402 return extF80_to_f128(Tmp, &Ignored);
5403}
5404
5405
5406/**
5407 * Converts from the packed IPRT 80-bit floating point (RTFLOAT80U) format to
5408 * the SoftFloat extended 80-bit floating point format (extFloat80_t).
5409 *
5410 * This is only a structure format conversion, nothing else.
5411 */
5412DECLINLINE(extFloat80_t) iemFpuSoftF80FromIprt(PCRTFLOAT80U pr80Val)
5413{
5414 extFloat80_t Tmp;
5415 Tmp.signExp = pr80Val->s2.uSignAndExponent;
5416 Tmp.signif = pr80Val->s2.uMantissa;
5417 return Tmp;
5418}
5419
5420
5421/**
5422 * Converts from SoftFloat extended 80-bit floating point format (extFloat80_t)
5423 * to the packed IPRT 80-bit floating point (RTFLOAT80U) format.
5424 *
5425 * This is only a structure format conversion, nothing else.
5426 */
5427DECLINLINE(PRTFLOAT80U) iemFpuSoftF80ToIprt(PRTFLOAT80U pr80Dst, extFloat80_t const r80XSrc)
5428{
5429 pr80Dst->s2.uSignAndExponent = r80XSrc.signExp;
5430 pr80Dst->s2.uMantissa = r80XSrc.signif;
5431 return pr80Dst;
5432}
5433
5434
5435DECLINLINE(uint16_t) iemFpuSoftF128ToFloat80(PRTFLOAT80U pr80Dst, float128_t r128Src, uint16_t fFcw, uint16_t fFsw)
5436{
5437 RT_NOREF(fFcw);
5438 RTFLOAT128U Tmp;
5439 *(float128_t *)&Tmp = r128Src;
5440 ASMCompilerBarrier();
5441
5442 if (RTFLOAT128U_IS_NORMAL(&Tmp))
5443 {
5444 pr80Dst->s.fSign = Tmp.s64.fSign;
5445 pr80Dst->s.uExponent = Tmp.s64.uExponent;
5446 uint64_t uFraction = Tmp.s64.uFractionHi << (63 - 48)
5447 | Tmp.s64.uFractionLo >> (64 - 15);
5448
5449 /* Do rounding - just truncate in near mode when midway on an even outcome. */
5450 unsigned const cShiftOff = 64 - 15;
5451 uint64_t const fRoundingOffMask = RT_BIT_64(cShiftOff) - 1;
5452 uint64_t const uRoundedOff = Tmp.s64.uFractionLo & fRoundingOffMask;
5453 if (uRoundedOff)
5454 {
5455 uint64_t const uRoundingAdd = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
5456 ? RT_BIT_64(cShiftOff - 1)
5457 : (fFcw & X86_FCW_RC_MASK) == (Tmp.s64.fSign ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)
5458 ? fRoundingOffMask
5459 : 0;
5460 if ( (fFcw & X86_FCW_RC_MASK) != X86_FCW_RC_NEAREST
5461 || (Tmp.s64.uFractionLo & RT_BIT_64(cShiftOff))
5462 || uRoundedOff != uRoundingAdd)
5463 {
5464 if ((uRoundedOff + uRoundingAdd) >> cShiftOff)
5465 {
5466 uFraction += 1;
5467 if (!(uFraction & RT_BIT_64(63)))
5468 { /* likely */ }
5469 else
5470 {
5471 uFraction >>= 1;
5472 pr80Dst->s.uExponent++;
5473 if (pr80Dst->s.uExponent == RTFLOAT64U_EXP_MAX)
5474 return fFsw;
5475 }
5476 fFsw |= X86_FSW_C1;
5477 }
5478 }
5479 fFsw |= X86_FSW_PE;
5480 if (!(fFcw & X86_FCW_PM))
5481 fFsw |= X86_FSW_ES | X86_FSW_B;
5482 }
5483
5484 pr80Dst->s.uMantissa = RT_BIT_64(63) | uFraction;
5485 }
5486 else if (RTFLOAT128U_IS_ZERO(&Tmp))
5487 {
5488 pr80Dst->s.fSign = Tmp.s64.fSign;
5489 pr80Dst->s.uExponent = 0;
5490 pr80Dst->s.uMantissa = 0;
5491 }
5492 else if (RTFLOAT128U_IS_INF(&Tmp))
5493 {
5494 pr80Dst->s.fSign = Tmp.s64.fSign;
5495 pr80Dst->s.uExponent = 0x7fff;
5496 pr80Dst->s.uMantissa = 0;
5497 }
5498 return fFsw;
5499}
5500
5501
5502/**
5503 * Helper for transfering exception and C1 to FSW and setting the result value
5504 * accordingly.
5505 *
5506 * @returns Updated FSW.
5507 * @param pSoftState The SoftFloat state following the operation.
5508 * @param r80XResult The result of the SoftFloat operation.
5509 * @param pr80Result Where to store the result for IEM.
5510 * @param fFcw The FPU control word.
5511 * @param fFsw The FSW before the operation, with necessary bits
5512 * cleared and such.
5513 * @param pr80XcptResult Alternative return value for use an unmasked \#IE is
5514 * raised.
5515 */
5516DECLINLINE(uint16_t) iemFpuSoftStateAndF80ToFswAndIprtResult(softfloat_state_t const *pSoftState, extFloat80_t r80XResult,
5517 PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw,
5518 PCRTFLOAT80U pr80XcptResult)
5519{
5520 fFsw |= (pSoftState->exceptionFlags & X86_FSW_XCPT_MASK)
5521 | (uint16_t)((pSoftState->exceptionFlags & softfloat_flag_c1) << 2);
5522 if (fFsw & ~fFcw & X86_FSW_XCPT_MASK)
5523 fFsw |= X86_FSW_ES | X86_FSW_B;
5524
5525 if (!(fFsw & ~fFcw & (X86_FSW_IE | X86_FSW_DE)))
5526 iemFpuSoftF80ToIprt(pr80Result, r80XResult);
5527 else
5528 {
5529 fFsw &= ~(X86_FSW_OE | X86_FSW_UE | X86_FSW_PE | X86_FSW_ZE | X86_FSW_C1);
5530 *pr80Result = *pr80XcptResult;
5531 }
5532 return fFsw;
5533}
5534
5535
5536/**
5537 * Helper doing polynomial evaluation using Horner's method.
5538 *
5539 * See https://en.wikipedia.org/wiki/Horner%27s_method for details.
5540 */
5541float128_t iemFpuSoftF128HornerPoly(float128_t z, PCRTFLOAT128U g_par128HornerConsts, size_t cHornerConsts,
5542 unsigned cPrecision, softfloat_state_t *pSoftState)
5543{
5544 Assert(cHornerConsts > 1);
5545 size_t i = cHornerConsts - 1;
5546 float128_t r128Result = iemFpuSoftF128PrecisionIprt(&g_par128HornerConsts[i], cPrecision);
5547 while (i-- > 0)
5548 {
5549 r128Result = iemFpuSoftF128Precision(f128_mul(r128Result, z, pSoftState), cPrecision);
5550 r128Result = f128_add(r128Result, iemFpuSoftF128PrecisionIprt(&g_par128HornerConsts[i], cPrecision), pSoftState);
5551 r128Result = iemFpuSoftF128Precision(r128Result, cPrecision);
5552 }
5553 return r128Result;
5554}
5555
5556#endif /* !IEM_WITH_FLOAT128_FOR_FPU - SoftFloat */
5557
5558
/**
 * Composes a normalized and rounded RTFLOAT80U result from a 192 bit wide
 * mantissa, exponent and sign.
 *
 * After normalization the most significant quad word (qw2) becomes the result
 * mantissa, while the two lower quad words (qw1, qw0) only decide the
 * rounding.  Exponent overflow is not handled, it merely trips an assertion.
 *
 * @returns Updated FSW.
 * @param   pr80Dst         Where to return the composed value.
 * @param   fSign           The sign.
 * @param   puMantissa      The mantissa, 256-bit type but the top 64 bits are
 *                          ignored and should be zero.  This will probably be
 *                          modified during normalization and rounding.
 * @param   iExponent       Unbiased exponent.
 * @param   fFcw            The FPU control word.
 * @param   fFsw            The FPU status word.
 */
static uint16_t iemFpuFloat80RoundAndComposeFrom192(PRTFLOAT80U pr80Dst, bool fSign, PRTUINT256U puMantissa,
                                                    int32_t iExponent, uint16_t fFcw, uint16_t fFsw)
{
    /* The top quad word must be zero; zero it if the assertion fails. */
    AssertStmt(puMantissa->QWords.qw3 == 0, puMantissa->QWords.qw3 = 0);

    /* Work with the biased exponent from here on. */
    iExponent += RTFLOAT80U_EXP_BIAS;

    /* Do normalization if necessary and possible. */
    if (!(puMantissa->QWords.qw2 & RT_BIT_64(63)))
    {
        /* Number of leading zero bits within the 192-bit mantissa. */
        int cShift = 192 - RTUInt256BitCount(puMantissa);
        if (iExponent > cShift)
            iExponent -= cShift;
        else
        {
            /* Not enough exponent to fully normalize.  With underflow masked,
               only shift as far as the exponent allows (ending at zero);
               otherwise shift fully and let the exponent go to zero or below. */
            if (fFcw & X86_FCW_UM)
            {
                if (iExponent > 0)
                    cShift = --iExponent;
                else
                    cShift = 0;
            }
            iExponent -= cShift;
        }
        RTUInt256AssignShiftLeft(puMantissa, cShift);
    }

    /* Do rounding. */
    uint64_t uMantissa = puMantissa->QWords.qw2;
    if (puMantissa->QWords.qw1 || puMantissa->QWords.qw0)
    {
        bool fAdd;
        switch (fFcw & X86_FCW_RC_MASK)
        {
            default: /* (for the simple-minded MSC which otherwise thinks fAdd would be used uninitialized) */
            case X86_FCW_RC_NEAREST:
                /* At or above the halfway point? */
                if (puMantissa->QWords.qw1 & RT_BIT_64(63))
                {
                    /* Round up unless it is exactly halfway and the mantissa is already even. */
                    if (   (uMantissa & 1)
                        || puMantissa->QWords.qw0 != 0
                        || puMantissa->QWords.qw1 != RT_BIT_64(63))
                    {
                        fAdd = true;
                        break;
                    }
                    uMantissa &= ~(uint64_t)1;
                }
                fAdd = false;
                break;
            case X86_FCW_RC_ZERO:
                fAdd = false;
                break;
            case X86_FCW_RC_UP:
                fAdd = !fSign;
                break;
            case X86_FCW_RC_DOWN:
                fAdd = fSign;
                break;
        }
        if (fAdd)
        {
            uint64_t const uTmp = uMantissa;
            uMantissa = uTmp + 1;
            if (uMantissa < uTmp)
            {
                /* The increment wrapped around: restore the top bit and bump the exponent. */
                uMantissa >>= 1;
                uMantissa  |= RT_BIT_64(63);
                iExponent++;
            }
            fFsw |= X86_FSW_C1;
        }
        /* Bits were discarded, so the result is inexact. */
        fFsw |= X86_FSW_PE;
        if (!(fFcw & X86_FCW_PM))
            fFsw |= X86_FSW_ES | X86_FSW_B;
    }

    /* Check for underflow (denormals). */
    if (iExponent <= 0)
    {
        if (fFcw & X86_FCW_UM)
        {
            /* Masked: store with a zero exponent, shifting the top bit out of the integer position. */
            if (uMantissa & RT_BIT_64(63))
                uMantissa >>= 1;
            iExponent = 0;
        }
        else
        {
            /* Unmasked: adjust the exponent by the bias adjustment constant and flag the exception. */
            iExponent += RTFLOAT80U_EXP_BIAS_ADJUST;
            fFsw |= X86_FSW_ES | X86_FSW_B;
        }
        fFsw |= X86_FSW_UE;
    }
    /* Check for overflow */
    else if (iExponent >= RTFLOAT80U_EXP_MAX)
    {
        /* Not implemented: this assertion always fails when we get here. */
        Assert(iExponent < RTFLOAT80U_EXP_MAX);
    }

    /* Compose the result. */
    pr80Dst->s.uMantissa = uMantissa;
    pr80Dst->s.uExponent = iExponent;
    pr80Dst->s.fSign     = fSign;
    return fFsw;
}
5677
5678
5679/**
5680 * See also iemAImpl_fld_r80_from_r32
5681 */
5682static uint16_t iemAImplConvertR32ToR80(PCRTFLOAT32U pr32Val, PRTFLOAT80U pr80Dst)
5683{
5684 uint16_t fFsw = 0;
5685 if (RTFLOAT32U_IS_NORMAL(pr32Val))
5686 {
5687 pr80Dst->sj64.fSign = pr32Val->s.fSign;
5688 pr80Dst->sj64.fInteger = 1;
5689 pr80Dst->sj64.uFraction = (uint64_t)pr32Val->s.uFraction
5690 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS);
5691 pr80Dst->sj64.uExponent = pr32Val->s.uExponent - RTFLOAT32U_EXP_BIAS + RTFLOAT80U_EXP_BIAS;
5692 Assert(RTFLOAT80U_IS_NORMAL(pr80Dst));
5693 }
5694 else if (RTFLOAT32U_IS_ZERO(pr32Val))
5695 {
5696 pr80Dst->s.fSign = pr32Val->s.fSign;
5697 pr80Dst->s.uExponent = 0;
5698 pr80Dst->s.uMantissa = 0;
5699 Assert(RTFLOAT80U_IS_ZERO(pr80Dst));
5700 }
5701 else if (RTFLOAT32U_IS_SUBNORMAL(pr32Val))
5702 {
5703 /* Subnormal -> normalized + X86_FSW_DE return. */
5704 pr80Dst->sj64.fSign = pr32Val->s.fSign;
5705 pr80Dst->sj64.fInteger = 1;
5706 unsigned const cExtraShift = RTFLOAT32U_FRACTION_BITS - ASMBitLastSetU32(pr32Val->s.uFraction);
5707 pr80Dst->sj64.uFraction = (uint64_t)pr32Val->s.uFraction
5708 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS + cExtraShift + 1);
5709 pr80Dst->sj64.uExponent = pr32Val->s.uExponent - RTFLOAT32U_EXP_BIAS + RTFLOAT80U_EXP_BIAS - cExtraShift;
5710 fFsw = X86_FSW_DE;
5711 }
5712 else if (RTFLOAT32U_IS_INF(pr32Val))
5713 {
5714 pr80Dst->s.fSign = pr32Val->s.fSign;
5715 pr80Dst->s.uExponent = RTFLOAT80U_EXP_MAX;
5716 pr80Dst->s.uMantissa = RT_BIT_64(63);
5717 Assert(RTFLOAT80U_IS_INF(pr80Dst));
5718 }
5719 else
5720 {
5721 Assert(RTFLOAT32U_IS_NAN(pr32Val));
5722 pr80Dst->sj64.fSign = pr32Val->s.fSign;
5723 pr80Dst->sj64.uExponent = RTFLOAT80U_EXP_MAX;
5724 pr80Dst->sj64.fInteger = 1;
5725 pr80Dst->sj64.uFraction = (uint64_t)pr32Val->s.uFraction
5726 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS);
5727 Assert(RTFLOAT80U_IS_NAN(pr80Dst));
5728 Assert(RTFLOAT80U_IS_SIGNALLING_NAN(pr80Dst) == RTFLOAT32U_IS_SIGNALLING_NAN(pr32Val));
5729 }
5730 return fFsw;
5731}
5732
5733
5734/**
5735 * See also iemAImpl_fld_r80_from_r64
5736 */
5737static uint16_t iemAImplConvertR64ToR80(PCRTFLOAT64U pr64Val, PRTFLOAT80U pr80Dst)
5738{
5739 uint16_t fFsw = 0;
5740 if (RTFLOAT64U_IS_NORMAL(pr64Val))
5741 {
5742 pr80Dst->sj64.fSign = pr64Val->s.fSign;
5743 pr80Dst->sj64.fInteger = 1;
5744 pr80Dst->sj64.uFraction = pr64Val->s64.uFraction << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS);
5745 pr80Dst->sj64.uExponent = pr64Val->s.uExponent - RTFLOAT64U_EXP_BIAS + RTFLOAT80U_EXP_BIAS;
5746 Assert(RTFLOAT80U_IS_NORMAL(pr80Dst));
5747 }
5748 else if (RTFLOAT64U_IS_ZERO(pr64Val))
5749 {
5750 pr80Dst->s.fSign = pr64Val->s.fSign;
5751 pr80Dst->s.uExponent = 0;
5752 pr80Dst->s.uMantissa = 0;
5753 Assert(RTFLOAT80U_IS_ZERO(pr80Dst));
5754 }
5755 else if (RTFLOAT64U_IS_SUBNORMAL(pr64Val))
5756 {
5757 /* Subnormal values gets normalized. */
5758 pr80Dst->sj64.fSign = pr64Val->s.fSign;
5759 pr80Dst->sj64.fInteger = 1;
5760 unsigned const cExtraShift = RTFLOAT64U_FRACTION_BITS - ASMBitLastSetU64(pr64Val->s64.uFraction);
5761 pr80Dst->sj64.uFraction = pr64Val->s64.uFraction
5762 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS + cExtraShift + 1);
5763 pr80Dst->sj64.uExponent = pr64Val->s.uExponent - RTFLOAT64U_EXP_BIAS + RTFLOAT80U_EXP_BIAS - cExtraShift;
5764 fFsw = X86_FSW_DE;
5765 }
5766 else if (RTFLOAT64U_IS_INF(pr64Val))
5767 {
5768 pr80Dst->s.fSign = pr64Val->s.fSign;
5769 pr80Dst->s.uExponent = RTFLOAT80U_EXP_MAX;
5770 pr80Dst->s.uMantissa = RT_BIT_64(63);
5771 Assert(RTFLOAT80U_IS_INF(pr80Dst));
5772 }
5773 else
5774 {
5775 /* Signalling and quiet NaNs, both turn into quiet ones when loaded (weird). */
5776 Assert(RTFLOAT64U_IS_NAN(pr64Val));
5777 pr80Dst->sj64.fSign = pr64Val->s.fSign;
5778 pr80Dst->sj64.uExponent = RTFLOAT80U_EXP_MAX;
5779 pr80Dst->sj64.fInteger = 1;
5780 pr80Dst->sj64.uFraction = pr64Val->s64.uFraction << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS);
5781 Assert(RTFLOAT80U_IS_NAN(pr80Dst));
5782 Assert(RTFLOAT80U_IS_SIGNALLING_NAN(pr80Dst) == RTFLOAT64U_IS_SIGNALLING_NAN(pr64Val));
5783 }
5784 return fFsw;
5785}
5786
5787
5788/**
5789 * See also EMIT_FILD.
5790 */
5791#define EMIT_CONVERT_IXX_TO_R80(a_cBits) \
5792static PRTFLOAT80U iemAImplConvertI ## a_cBits ## ToR80(int ## a_cBits ## _t iVal, PRTFLOAT80U pr80Dst) \
5793{ \
5794 if (iVal == 0) \
5795 { \
5796 pr80Dst->s.fSign = 0; \
5797 pr80Dst->s.uExponent = 0; \
5798 pr80Dst->s.uMantissa = 0; \
5799 } \
5800 else \
5801 { \
5802 if (iVal > 0) \
5803 pr80Dst->s.fSign = 0; \
5804 else \
5805 { \
5806 pr80Dst->s.fSign = 1; \
5807 iVal = -iVal; \
5808 } \
5809 unsigned const cBits = ASMBitLastSetU ## a_cBits((uint ## a_cBits ## _t)iVal); \
5810 pr80Dst->s.uExponent = cBits - 1 + RTFLOAT80U_EXP_BIAS; \
5811 pr80Dst->s.uMantissa = (uint64_t)iVal << (RTFLOAT80U_FRACTION_BITS + 1 - cBits); \
5812 } \
5813 return pr80Dst; \
5814}
5815EMIT_CONVERT_IXX_TO_R80(16)
5816EMIT_CONVERT_IXX_TO_R80(32)
5817//EMIT_CONVERT_IXX_TO_R80(64)
5818
/**
 * For implementing iemAImpl_fmul_r80_by_r64 and such.
 *
 * The emitted function converts the 64-bit operand to 80-bit and forwards to
 * @a a_fnR80ByR80, forcing the TOP field of the resulting FSW to 7.  If the
 * conversion reports a denormal operand (X86_FSW_DE), that is
 *      - dropped when the first operand is a 387-invalid value or a NaN, or
 *        when @a a_DenormalException evaluates to true,
 *      - otherwise raised without doing the operation when FCW.DM is clear
 *        (the result is the first operand, FSW gets DE, ES and B),
 *      - otherwise ORed into the FSW produced by @a a_fnR80ByR80.
 *
 * @param   a_Name                  Name of the function to emit.
 * @param   a_fnR80ByR80            The 80-bit by 80-bit worker to forward to.
 * @param   a_DenormalException     Expression for additionally suppressing
 *                                  the denormal exception; may use the
 *                                  parameters of the emitted function.
 */
#define EMIT_R80_BY_R64(a_Name, a_fnR80ByR80, a_DenormalException) \
IEM_DECL_IMPL_DEF(void, a_Name,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val1, PCRTFLOAT64U pr64Val2)) \
{ \
    RTFLOAT80U r80Val2; \
    uint16_t   fFsw = iemAImplConvertR64ToR80(pr64Val2, &r80Val2); \
    Assert(!fFsw || fFsw == X86_FSW_DE); \
    if (!fFsw) \
    { /* likely: the operand was not a denormal */ } \
    else if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_NAN(pr80Val1) || (a_DenormalException)) \
        fFsw = 0; \
    else if (!(pFpuState->FCW & X86_FCW_DM)) \
    { \
        pFpuRes->r80Result = *pr80Val1; \
        pFpuRes->FSW       = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT) \
                           | X86_FSW_DE | X86_FSW_ES | X86_FSW_B; \
        return; \
    } \
    \
    a_fnR80ByR80(pFpuState, pFpuRes, pr80Val1, &r80Val2); \
    pFpuRes->FSW = (pFpuRes->FSW & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT) | fFsw; \
}
5841
/**
 * For implementing iemAImpl_fmul_r80_by_r32 and such.
 *
 * The emitted function converts the 32-bit operand to 80-bit and forwards to
 * @a a_fnR80ByR80, forcing the TOP field of the resulting FSW to 7.  If the
 * conversion reports a denormal operand (X86_FSW_DE), that is
 *      - dropped when the first operand is a 387-invalid value or a NaN, or
 *        when @a a_DenormalException evaluates to true,
 *      - otherwise raised without doing the operation when FCW.DM is clear
 *        (the result is the first operand, FSW gets DE, ES and B),
 *      - otherwise ORed into the FSW produced by @a a_fnR80ByR80.
 *
 * @param   a_Name                  Name of the function to emit.
 * @param   a_fnR80ByR80            The 80-bit by 80-bit worker to forward to.
 * @param   a_DenormalException     Expression for additionally suppressing
 *                                  the denormal exception; may use the
 *                                  parameters of the emitted function.
 */
#define EMIT_R80_BY_R32(a_Name, a_fnR80ByR80, a_DenormalException) \
IEM_DECL_IMPL_DEF(void, a_Name,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val1, PCRTFLOAT32U pr32Val2)) \
{ \
    RTFLOAT80U r80Val2; \
    uint16_t   fFsw = iemAImplConvertR32ToR80(pr32Val2, &r80Val2); \
    Assert(!fFsw || fFsw == X86_FSW_DE); \
    if (!fFsw) \
    { /* likely: the operand was not a denormal */ } \
    else if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_NAN(pr80Val1) || (a_DenormalException)) \
        fFsw = 0; \
    else if (!(pFpuState->FCW & X86_FCW_DM)) \
    { \
        pFpuRes->r80Result = *pr80Val1; \
        pFpuRes->FSW       = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT) \
                           | X86_FSW_DE | X86_FSW_ES | X86_FSW_B; \
        return; \
    } \
    \
    a_fnR80ByR80(pFpuState, pFpuRes, pr80Val1, &r80Val2); \
    pFpuRes->FSW = (pFpuRes->FSW & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT) | fFsw; \
}
5864
/**
 * For implementing iemAImpl_fimul_r80_by_i32 and such.
 *
 * The emitted function converts the 32-bit integer operand to 80-bit floating
 * point, forwards to @a a_fnR80ByR80 and forces the TOP field of the
 * resulting FSW to 7.
 *
 * @param   a_Name          Name of the function to emit.
 * @param   a_fnR80ByR80    The 80-bit by 80-bit worker to forward to.
 */
#define EMIT_R80_BY_I32(a_Name, a_fnR80ByR80) \
IEM_DECL_IMPL_DEF(void, a_Name,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val1, int32_t const *pi32Val2)) \
{ \
    RTFLOAT80U r80Val2; \
    iemAImplConvertI32ToR80(*pi32Val2, &r80Val2); \
    a_fnR80ByR80(pFpuState, pFpuRes, pr80Val1, &r80Val2); \
    pFpuRes->FSW = (pFpuRes->FSW & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT); \
}
5873
/**
 * For implementing iemAImpl_fimul_r80_by_i16 and such.
 *
 * The emitted function converts the 16-bit integer operand to 80-bit floating
 * point, forwards to @a a_fnR80ByR80 and forces the TOP field of the
 * resulting FSW to 7.
 *
 * @param   a_Name          Name of the function to emit.
 * @param   a_fnR80ByR80    The 80-bit by 80-bit worker to forward to.
 */
#define EMIT_R80_BY_I16(a_Name, a_fnR80ByR80) \
IEM_DECL_IMPL_DEF(void, a_Name,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val1, int16_t const *pi16Val2)) \
{ \
    RTFLOAT80U r80Val2; \
    iemAImplConvertI16ToR80(*pi16Val2, &r80Val2); \
    a_fnR80ByR80(pFpuState, pFpuRes, pr80Val1, &r80Val2); \
    pFpuRes->FSW = (pFpuRes->FSW & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT); \
}
5882
5883
5884
5885/*********************************************************************************************************************************
*   x87 FPU Division Operations                                                                                                  *
5887*********************************************************************************************************************************/
5888
5889/** Worker for iemAImpl_fdiv_r80_by_r80 & iemAImpl_fdivr_r80_by_r80. */
5890static uint16_t iemAImpl_fdiv_f80_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result,
5891 uint16_t fFcw, uint16_t fFsw, PCRTFLOAT80U pr80Val1Org)
5892{
5893 if (!RTFLOAT80U_IS_ZERO(pr80Val2) || RTFLOAT80U_IS_NAN(pr80Val1) || RTFLOAT80U_IS_INF(pr80Val1))
5894 {
5895 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
5896 extFloat80_t r80XResult = extF80_div(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2), &SoftState);
5897 return iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, pr80Result, fFcw, fFsw, pr80Val1Org);
5898 }
5899 if (!RTFLOAT80U_IS_ZERO(pr80Val1))
5900 { /* Div by zero. */
5901 if (fFcw & X86_FCW_ZM)
5902 *pr80Result = g_ar80Infinity[pr80Val1->s.fSign != pr80Val2->s.fSign];
5903 else
5904 {
5905 *pr80Result = *pr80Val1Org;
5906 fFsw |= X86_FSW_ES | X86_FSW_B;
5907 }
5908 fFsw |= X86_FSW_ZE;
5909 }
5910 else
5911 { /* Invalid operand */
5912 if (fFcw & X86_FCW_IM)
5913 *pr80Result = g_r80Indefinite;
5914 else
5915 {
5916 *pr80Result = *pr80Val1Org;
5917 fFsw |= X86_FSW_ES | X86_FSW_B;
5918 }
5919 fFsw |= X86_FSW_IE;
5920 }
5921 return fFsw;
5922}
5923
5924
5925IEM_DECL_IMPL_DEF(void, iemAImpl_fdiv_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
5926 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
5927{
5928 uint16_t const fFcw = pFpuState->FCW;
5929 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
5930
5931 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals. */
5932 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
5933 {
5934 if (fFcw & X86_FCW_IM)
5935 pFpuRes->r80Result = g_r80Indefinite;
5936 else
5937 {
5938 pFpuRes->r80Result = *pr80Val1;
5939 fFsw |= X86_FSW_ES | X86_FSW_B;
5940 }
5941 fFsw |= X86_FSW_IE;
5942 }
5943 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs & /0 trumps denormals. */
5944 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2) && !RTFLOAT80U_IS_ZERO(pr80Val2))
5945 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1)) )
5946 {
5947 if (fFcw & X86_FCW_DM)
5948 {
5949 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
5950 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
5951 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
5952 fFsw = iemAImpl_fdiv_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
5953 }
5954 else
5955 {
5956 pFpuRes->r80Result = *pr80Val1;
5957 fFsw |= X86_FSW_ES | X86_FSW_B;
5958 }
5959 fFsw |= X86_FSW_DE;
5960 }
5961 /* SoftFloat can handle the rest: */
5962 else
5963 fFsw = iemAImpl_fdiv_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
5964
5965 pFpuRes->FSW = fFsw;
5966}
5967
5968
/* The FDIV/FIDIV variants taking a 64-bit or 32-bit floating point or a 32-bit
   or 16-bit integer second operand.  They all forward to
   iemAImpl_fdiv_r80_by_r80; the floating point ones request no additional
   suppression of the denormal operand exception (last argument 0). */
EMIT_R80_BY_R64(iemAImpl_fdiv_r80_by_r64,  iemAImpl_fdiv_r80_by_r80, 0)
EMIT_R80_BY_R32(iemAImpl_fdiv_r80_by_r32,  iemAImpl_fdiv_r80_by_r80, 0)
EMIT_R80_BY_I32(iemAImpl_fidiv_r80_by_i32, iemAImpl_fdiv_r80_by_r80)
EMIT_R80_BY_I16(iemAImpl_fidiv_r80_by_i16, iemAImpl_fdiv_r80_by_r80)
5973
5974
5975IEM_DECL_IMPL_DEF(void, iemAImpl_fdivr_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
5976 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
5977{
5978 uint16_t const fFcw = pFpuState->FCW;
5979 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
5980
5981 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals. */
5982 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
5983 {
5984 if (fFcw & X86_FCW_IM)
5985 pFpuRes->r80Result = g_r80Indefinite;
5986 else
5987 {
5988 pFpuRes->r80Result = *pr80Val1;
5989 fFsw |= X86_FSW_ES | X86_FSW_B;
5990 }
5991 fFsw |= X86_FSW_IE;
5992 }
5993 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs & /0 trumps denormals. */
5994 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2))
5995 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1) && !RTFLOAT80U_IS_ZERO(pr80Val1)) )
5996 {
5997 if (fFcw & X86_FCW_DM)
5998 {
5999 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
6000 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
6001 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
6002 fFsw = iemAImpl_fdiv_f80_r80_worker(pr80Val2, pr80Val1, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
6003 }
6004 else
6005 {
6006 pFpuRes->r80Result = *pr80Val1;
6007 fFsw |= X86_FSW_ES | X86_FSW_B;
6008 }
6009 fFsw |= X86_FSW_DE;
6010 }
6011 /* SoftFloat can handle the rest: */
6012 else
6013 fFsw = iemAImpl_fdiv_f80_r80_worker(pr80Val2, pr80Val1, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
6014
6015 pFpuRes->FSW = fFsw;
6016}
6017
6018
/* The FDIVR/FIDIVR variants taking a 64-bit or 32-bit floating point or a
   32-bit or 16-bit integer second operand.  They all forward to
   iemAImpl_fdivr_r80_by_r80; the floating point ones additionally drop the
   denormal operand exception when the first operand is zero. */
EMIT_R80_BY_R64(iemAImpl_fdivr_r80_by_r64,  iemAImpl_fdivr_r80_by_r80, RTFLOAT80U_IS_ZERO(pr80Val1))
EMIT_R80_BY_R32(iemAImpl_fdivr_r80_by_r32,  iemAImpl_fdivr_r80_by_r80, RTFLOAT80U_IS_ZERO(pr80Val1))
EMIT_R80_BY_I32(iemAImpl_fidivr_r80_by_i32, iemAImpl_fdivr_r80_by_r80)
EMIT_R80_BY_I16(iemAImpl_fidivr_r80_by_i16, iemAImpl_fdivr_r80_by_r80)
6023
6024
6025/** Worker for iemAImpl_fprem_r80_by_r80 & iemAImpl_fprem1_r80_by_r80. */
6026static uint16_t iemAImpl_fprem_fprem1_r80_by_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result,
6027 uint16_t fFcw, uint16_t fFsw, PCRTFLOAT80U pr80Val1Org, bool fLegacyInstr)
6028{
6029 if (!RTFLOAT80U_IS_ZERO(pr80Val2) || RTFLOAT80U_IS_NAN(pr80Val1) || RTFLOAT80U_IS_INF(pr80Val1))
6030 {
6031 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
6032 uint16_t fCxFlags = 0;
6033 extFloat80_t r80XResult = extF80_partialRem(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2),
6034 fLegacyInstr ? softfloat_round_minMag : softfloat_round_near_even,
6035 &fCxFlags, &SoftState);
6036 Assert(!(fCxFlags & ~X86_FSW_C_MASK));
6037 fFsw = iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, pr80Result, fFcw, fFsw, pr80Val1Org);
6038 if ( !(fFsw & X86_FSW_IE)
6039 && !RTFLOAT80U_IS_NAN(pr80Result)
6040 && !RTFLOAT80U_IS_INDEFINITE(pr80Result))
6041 {
6042 fFsw &= ~(uint16_t)X86_FSW_C_MASK;
6043 fFsw |= fCxFlags & X86_FSW_C_MASK;
6044 }
6045 return fFsw;
6046 }
6047
6048 /* Invalid operand */
6049 if (fFcw & X86_FCW_IM)
6050 *pr80Result = g_r80Indefinite;
6051 else
6052 {
6053 *pr80Result = *pr80Val1Org;
6054 fFsw |= X86_FSW_ES | X86_FSW_B;
6055 }
6056 return fFsw | X86_FSW_IE;
6057}
6058
6059
6060static void iemAImpl_fprem_fprem1_r80_by_r80(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6061 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, bool fLegacyInstr)
6062{
6063 uint16_t const fFcw = pFpuState->FCW;
6064 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 /*| X86_FSW_C2*/ | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
6065
6066 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals.
6067 In addition, we'd like to handle zero ST(1) now as SoftFloat returns Inf instead
6068 of Indefinite. (Note! There is no #Z like the footnotes to tables 3-31 and 3-32
6069 for the FPREM1 & FPREM1 instructions in the intel reference manual claims!) */
6070 if ( RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2)
6071 || (RTFLOAT80U_IS_ZERO(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1) && !RTFLOAT80U_IS_INDEFINITE(pr80Val1)))
6072 {
6073 if (fFcw & X86_FCW_IM)
6074 pFpuRes->r80Result = g_r80Indefinite;
6075 else
6076 {
6077 pFpuRes->r80Result = *pr80Val1;
6078 fFsw |= X86_FSW_ES | X86_FSW_B;
6079 }
6080 fFsw |= X86_FSW_IE;
6081 }
6082 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs & /0 trumps denormals. */
6083 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2) && !RTFLOAT80U_IS_ZERO(pr80Val2))
6084 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1) && !RTFLOAT80U_IS_INF(pr80Val1)) )
6085 {
6086 if (fFcw & X86_FCW_DM)
6087 {
6088 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
6089 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
6090 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
6091 fFsw = iemAImpl_fprem_fprem1_r80_by_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw,
6092 pr80Val1Org, fLegacyInstr);
6093 }
6094 else
6095 {
6096 pFpuRes->r80Result = *pr80Val1;
6097 fFsw |= X86_FSW_ES | X86_FSW_B;
6098 }
6099 fFsw |= X86_FSW_DE;
6100 }
6101 /* SoftFloat can handle the rest: */
6102 else
6103 fFsw = iemAImpl_fprem_fprem1_r80_by_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw,
6104 pr80Val1, fLegacyInstr);
6105
6106 pFpuRes->FSW = fFsw;
6107}
6108
6109
6110IEM_DECL_IMPL_DEF(void, iemAImpl_fprem_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6111 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6112{
6113 iemAImpl_fprem_fprem1_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2, true /*fLegacyInstr*/);
6114}
6115
6116
6117IEM_DECL_IMPL_DEF(void, iemAImpl_fprem1_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6118 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6119{
6120 iemAImpl_fprem_fprem1_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2, false /*fLegacyInstr*/);
6121}
6122
6123
6124/*********************************************************************************************************************************
6125* x87 FPU Multiplication Operations *
6126*********************************************************************************************************************************/
6127
6128/** Worker for iemAImpl_fmul_r80_by_r80. */
6129static uint16_t iemAImpl_fmul_f80_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result,
6130 uint16_t fFcw, uint16_t fFsw, PCRTFLOAT80U pr80Val1Org)
6131{
6132 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
6133 extFloat80_t r80XResult = extF80_mul(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2), &SoftState);
6134 return iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, pr80Result, fFcw, fFsw, pr80Val1Org);
6135}
6136
6137
6138IEM_DECL_IMPL_DEF(void, iemAImpl_fmul_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6139 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6140{
6141 uint16_t const fFcw = pFpuState->FCW;
6142 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
6143
6144 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals. */
6145 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
6146 {
6147 if (fFcw & X86_FCW_IM)
6148 pFpuRes->r80Result = g_r80Indefinite;
6149 else
6150 {
6151 pFpuRes->r80Result = *pr80Val1;
6152 fFsw |= X86_FSW_ES | X86_FSW_B;
6153 }
6154 fFsw |= X86_FSW_IE;
6155 }
6156 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs trumps denormals. */
6157 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2))
6158 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1)) )
6159 {
6160 if (fFcw & X86_FCW_DM)
6161 {
6162 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
6163 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
6164 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
6165 fFsw = iemAImpl_fmul_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
6166 }
6167 else
6168 {
6169 pFpuRes->r80Result = *pr80Val1;
6170 fFsw |= X86_FSW_ES | X86_FSW_B;
6171 }
6172 fFsw |= X86_FSW_DE;
6173 }
6174 /* SoftFloat can handle the rest: */
6175 else
6176 fFsw = iemAImpl_fmul_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
6177
6178 pFpuRes->FSW = fFsw;
6179}
6180
6181
/* r64, r32, i32 and i16 second-operand variants of the multiplication helper.
 * NOTE(review): the EMIT_R80_BY_* macros are defined outside this excerpt; presumably each
 * one emits the function named by its first argument, which widens the second operand to
 * 80 bits and forwards to the implementation named by its second argument - confirm there. */
6182EMIT_R80_BY_R64(iemAImpl_fmul_r80_by_r64, iemAImpl_fmul_r80_by_r80, 0)
6183EMIT_R80_BY_R32(iemAImpl_fmul_r80_by_r32, iemAImpl_fmul_r80_by_r80, 0)
6184EMIT_R80_BY_I32(iemAImpl_fimul_r80_by_i32, iemAImpl_fmul_r80_by_r80)
6185EMIT_R80_BY_I16(iemAImpl_fimul_r80_by_i16, iemAImpl_fmul_r80_by_r80)
6186
6187
6188/*********************************************************************************************************************************
6189* x87 FPU Addition *
6190*********************************************************************************************************************************/
6191
6192/** Worker for iemAImpl_fadd_r80_by_r80. */
6193static uint16_t iemAImpl_fadd_f80_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result,
6194 uint16_t fFcw, uint16_t fFsw, PCRTFLOAT80U pr80Val1Org)
6195{
6196 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
6197 extFloat80_t r80XResult = extF80_add(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2), &SoftState);
6198 return iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, pr80Result, fFcw, fFsw, pr80Val1Org);
6199}
6200
6201
6202IEM_DECL_IMPL_DEF(void, iemAImpl_fadd_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6203 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6204{
6205 uint16_t const fFcw = pFpuState->FCW;
6206 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
6207
6208 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals. */
6209 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
6210 {
6211 if (fFcw & X86_FCW_IM)
6212 pFpuRes->r80Result = g_r80Indefinite;
6213 else
6214 {
6215 pFpuRes->r80Result = *pr80Val1;
6216 fFsw |= X86_FSW_ES | X86_FSW_B;
6217 }
6218 fFsw |= X86_FSW_IE;
6219 }
6220 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs trumps denormals. */
6221 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2))
6222 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1)) )
6223 {
6224 if (fFcw & X86_FCW_DM)
6225 {
6226 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
6227 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
6228 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
6229 fFsw = iemAImpl_fadd_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
6230 }
6231 else
6232 {
6233 pFpuRes->r80Result = *pr80Val1;
6234 fFsw |= X86_FSW_ES | X86_FSW_B;
6235 }
6236 fFsw |= X86_FSW_DE;
6237 }
6238 /* SoftFloat can handle the rest: */
6239 else
6240 fFsw = iemAImpl_fadd_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
6241
6242 pFpuRes->FSW = fFsw;
6243}
6244
6245
/* r64, r32, i32 and i16 second-operand variants of the addition helper.
 * NOTE(review): the EMIT_R80_BY_* macros are defined outside this excerpt; presumably each
 * one emits the function named by its first argument, which widens the second operand to
 * 80 bits and forwards to the implementation named by its second argument - confirm there. */
6246EMIT_R80_BY_R64(iemAImpl_fadd_r80_by_r64, iemAImpl_fadd_r80_by_r80, 0)
6247EMIT_R80_BY_R32(iemAImpl_fadd_r80_by_r32, iemAImpl_fadd_r80_by_r80, 0)
6248EMIT_R80_BY_I32(iemAImpl_fiadd_r80_by_i32, iemAImpl_fadd_r80_by_r80)
6249EMIT_R80_BY_I16(iemAImpl_fiadd_r80_by_i16, iemAImpl_fadd_r80_by_r80)
6250
6251
6252/*********************************************************************************************************************************
6253* x87 FPU Subtraction *
6254*********************************************************************************************************************************/
6255
6256/** Worker for iemAImpl_fsub_r80_by_r80 and iemAImpl_fsubr_r80_by_r80. */
6257static uint16_t iemAImpl_fsub_f80_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result,
6258 uint16_t fFcw, uint16_t fFsw, PCRTFLOAT80U pr80Val1Org)
6259{
6260 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
6261 extFloat80_t r80XResult = extF80_sub(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2), &SoftState);
6262 return iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, pr80Result, fFcw, fFsw, pr80Val1Org);
6263}
6264
6265
6266IEM_DECL_IMPL_DEF(void, iemAImpl_fsub_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6267 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6268{
6269 uint16_t const fFcw = pFpuState->FCW;
6270 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
6271
6272 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals. */
6273 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
6274 {
6275 if (fFcw & X86_FCW_IM)
6276 pFpuRes->r80Result = g_r80Indefinite;
6277 else
6278 {
6279 pFpuRes->r80Result = *pr80Val1;
6280 fFsw |= X86_FSW_ES | X86_FSW_B;
6281 }
6282 fFsw |= X86_FSW_IE;
6283 }
6284 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs trumps denormals. */
6285 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2))
6286 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1)) )
6287 {
6288 if (fFcw & X86_FCW_DM)
6289 {
6290 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
6291 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
6292 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
6293 fFsw = iemAImpl_fsub_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
6294 }
6295 else
6296 {
6297 pFpuRes->r80Result = *pr80Val1;
6298 fFsw |= X86_FSW_ES | X86_FSW_B;
6299 }
6300 fFsw |= X86_FSW_DE;
6301 }
6302 /* SoftFloat can handle the rest: */
6303 else
6304 fFsw = iemAImpl_fsub_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
6305
6306 pFpuRes->FSW = fFsw;
6307}
6308
6309
/* r64, r32, i32 and i16 second-operand variants of the subtraction helper.
 * NOTE(review): the EMIT_R80_BY_* macros are defined outside this excerpt; presumably each
 * one emits the function named by its first argument, which widens the second operand to
 * 80 bits and forwards to the implementation named by its second argument - confirm there. */
6310EMIT_R80_BY_R64(iemAImpl_fsub_r80_by_r64, iemAImpl_fsub_r80_by_r80, 0)
6311EMIT_R80_BY_R32(iemAImpl_fsub_r80_by_r32, iemAImpl_fsub_r80_by_r80, 0)
6312EMIT_R80_BY_I32(iemAImpl_fisub_r80_by_i32, iemAImpl_fsub_r80_by_r80)
6313EMIT_R80_BY_I16(iemAImpl_fisub_r80_by_i16, iemAImpl_fsub_r80_by_r80)
6314
6315
6316/* Same as iemAImpl_fsub_r80_by_r80, but with input operands switched. */
6317IEM_DECL_IMPL_DEF(void, iemAImpl_fsubr_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6318 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6319{
6320 uint16_t const fFcw = pFpuState->FCW;
6321 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
6322
6323 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals. */
6324 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
6325 {
6326 if (fFcw & X86_FCW_IM)
6327 pFpuRes->r80Result = g_r80Indefinite;
6328 else
6329 {
6330 pFpuRes->r80Result = *pr80Val1;
6331 fFsw |= X86_FSW_ES | X86_FSW_B;
6332 }
6333 fFsw |= X86_FSW_IE;
6334 }
6335 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs trumps denormals. */
6336 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2))
6337 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1)) )
6338 {
6339 if (fFcw & X86_FCW_DM)
6340 {
6341 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
6342 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
6343 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
6344 fFsw = iemAImpl_fsub_f80_r80_worker(pr80Val2, pr80Val1, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
6345 }
6346 else
6347 {
6348 pFpuRes->r80Result = *pr80Val1;
6349 fFsw |= X86_FSW_ES | X86_FSW_B;
6350 }
6351 fFsw |= X86_FSW_DE;
6352 }
6353 /* SoftFloat can handle the rest: */
6354 else
6355 fFsw = iemAImpl_fsub_f80_r80_worker(pr80Val2, pr80Val1, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
6356
6357 pFpuRes->FSW = fFsw;
6358}
6359
6360
/* r64, r32, i32 and i16 second-operand variants of the reversed subtraction helper.
 * NOTE(review): the EMIT_R80_BY_* macros are defined outside this excerpt; presumably each
 * one emits the function named by its first argument, which widens the second operand to
 * 80 bits and forwards to the implementation named by its second argument - confirm there. */
6361EMIT_R80_BY_R64(iemAImpl_fsubr_r80_by_r64, iemAImpl_fsubr_r80_by_r80, 0)
6362EMIT_R80_BY_R32(iemAImpl_fsubr_r80_by_r32, iemAImpl_fsubr_r80_by_r80, 0)
6363EMIT_R80_BY_I32(iemAImpl_fisubr_r80_by_i32, iemAImpl_fsubr_r80_by_r80)
6364EMIT_R80_BY_I16(iemAImpl_fisubr_r80_by_i16, iemAImpl_fsubr_r80_by_r80)
6365
6366
6367/*********************************************************************************************************************************
6368* x87 FPU Trigonometric Operations *
6369*********************************************************************************************************************************/
6370static uint16_t iemAImpl_fpatan_r80_by_r80_normal(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PIEMFPURESULT pFpuRes, uint16_t fFcw, uint16_t fFsw)
6371{
6372 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
6373 extFloat80_t y = iemFpuSoftF80FromIprt(pr80Val1);
6374 extFloat80_t x = iemFpuSoftF80FromIprt(pr80Val2);
6375 extFloat80_t v;
6376 (void)fFcw;
6377
6378 v = extF80_atan2(y, x, &SoftState);
6379
6380 iemFpuSoftF80ToIprt(&pFpuRes->r80Result, v);
6381 return fFsw;
6382}
6383
6384IEM_DECL_IMPL_DEF(void, iemAImpl_fpatan_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6385 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6386{
6387 uint16_t const fFcw = pFpuState->FCW;
6388 uint16_t fFsw = pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3);
6389
6390 if (RTFLOAT80U_IS_NORMAL(pr80Val1) && RTFLOAT80U_IS_NORMAL(pr80Val2))
6391 {
6392 fFsw = iemAImpl_fpatan_r80_by_r80_normal(pr80Val1, pr80Val2, pFpuRes, fFcw, fFsw);
6393
6394 fFsw |= X86_FSW_PE | (7 << X86_FSW_TOP_SHIFT);
6395 if (!(fFcw & X86_FCW_PM))
6396 fFsw |= X86_FSW_ES | X86_FSW_B;
6397 }
6398 else
6399 {
6400 fFsw |= X86_FSW_IE;
6401 if (!(fFcw & X86_FCW_IM))
6402 {
6403 pFpuRes->r80Result = *pr80Val2;
6404 fFsw |= X86_FSW_ES | X86_FSW_B | (6 << X86_FSW_TOP_SHIFT);
6405 }
6406 else
6407 {
6408 pFpuRes->r80Result = g_r80Indefinite;
6409 fFsw |= (7 << X86_FSW_TOP_SHIFT);
6410 }
6411 }
6412
6413 pFpuRes->FSW = fFsw;
6414}
6415#endif /* IEM_WITHOUT_ASSEMBLY */
6416
6417IEM_DECL_IMPL_DEF(void, iemAImpl_fpatan_r80_by_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6418 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6419{
6420 iemAImpl_fpatan_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
6421}
6422
6423IEM_DECL_IMPL_DEF(void, iemAImpl_fpatan_r80_by_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6424 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6425{
6426 iemAImpl_fpatan_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
6427}
6428
6429
6430#if defined(IEM_WITHOUT_ASSEMBLY)
6431static uint16_t iemAImpl_fptan_r80_r80_normal(PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val, uint16_t fFcw, uint16_t fFsw)
6432{
6433 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
6434 extFloat80_t x = iemFpuSoftF80FromIprt(pr80Val);
6435 extFloat80_t v;
6436 (void)fFcw;
6437
6438 v = extF80_tan(x, &SoftState);
6439
6440 iemFpuSoftF80ToIprt(&pFpuResTwo->r80Result1, v);
6441 return fFsw;
6442}
6443
6444IEM_DECL_IMPL_DEF(void, iemAImpl_fptan_r80_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
6445{
6446 uint16_t const fFcw = pFpuState->FCW;
6447 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | /*X86_FSW_C2 |*/ X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
6448
6449 if (RTFLOAT80U_IS_ZERO(pr80Val))
6450 {
6451 pFpuResTwo->r80Result1 = *pr80Val;
6452 pFpuResTwo->r80Result2 = g_ar80One[0];
6453 }
6454 else if (RTFLOAT80U_IS_NORMAL(pr80Val))
6455 {
6456 if (pr80Val->s.uExponent >= RTFLOAT80U_EXP_BIAS + 63)
6457 {
6458 fFsw |= X86_FSW_C2 | (7 << X86_FSW_TOP_SHIFT);
6459 pFpuResTwo->r80Result1 = *pr80Val;
6460 }
6461 else
6462 {
6463 if (pr80Val->s.uExponent <= RTFLOAT80U_EXP_BIAS - 63)
6464 {
6465 pFpuResTwo->r80Result1 = *pr80Val;
6466 }
6467 else
6468 {
6469 fFsw = iemAImpl_fptan_r80_r80_normal(pFpuResTwo, pr80Val, fFcw, fFsw);
6470 }
6471
6472 pFpuResTwo->r80Result2 = g_ar80One[0];
6473
6474 fFsw |= X86_FSW_PE;
6475 if (!(fFcw & X86_FCW_PM))
6476 fFsw |= X86_FSW_ES | X86_FSW_B;
6477 }
6478 }
6479 else
6480 {
6481 fFsw |= X86_FSW_IE;
6482 if (!(fFcw & X86_FCW_IM))
6483 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT);
6484 }
6485
6486 pFpuResTwo->FSW = fFsw;
6487}
6488#endif /* IEM_WITHOUT_ASSEMBLY */
6489
6490IEM_DECL_IMPL_DEF(void, iemAImpl_fptan_r80_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
6491{
6492 iemAImpl_fptan_r80_r80(pFpuState, pFpuResTwo, pr80Val);
6493}
6494
6495IEM_DECL_IMPL_DEF(void, iemAImpl_fptan_r80_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
6496{
6497 iemAImpl_fptan_r80_r80(pFpuState, pFpuResTwo, pr80Val);
6498}
6499
6500#ifdef IEM_WITHOUT_ASSEMBLY
6501
6502static uint16_t iemAImpl_fsin_r80_normal(PCRTFLOAT80U pr80Val, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
6503{
6504 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
6505 extFloat80_t x = iemFpuSoftF80FromIprt(pr80Val);
6506 extFloat80_t v;
6507 (void)fFcw;
6508
6509 v = extF80_sin(x, &SoftState);
6510
6511 iemFpuSoftF80ToIprt(pr80Result, v);
6512
6513 return fFsw;
6514}
6515
6516IEM_DECL_IMPL_DEF(void, iemAImpl_fsin_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6517{
6518 uint16_t const fFcw = pFpuState->FCW;
6519 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | /*X86_FSW_C2 |*/ X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
6520
6521 if (RTFLOAT80U_IS_ZERO(pr80Val))
6522 {
6523 pFpuRes->r80Result = *pr80Val;
6524 }
6525 else if (RTFLOAT80U_IS_NORMAL(pr80Val))
6526 {
6527 if (pr80Val->s.uExponent >= RTFLOAT80U_EXP_BIAS + 63)
6528 {
6529 fFsw |= X86_FSW_C2;
6530 pFpuRes->r80Result = *pr80Val;
6531 }
6532 else
6533 {
6534 if (pr80Val->s.uExponent <= RTFLOAT80U_EXP_BIAS - 63)
6535 {
6536 pFpuRes->r80Result = *pr80Val;
6537 }
6538 else
6539 {
6540 fFsw = iemAImpl_fsin_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
6541 }
6542 fFsw |= X86_FSW_PE;
6543 if (!(fFcw & X86_FCW_PM))
6544 fFsw |= X86_FSW_ES | X86_FSW_B;
6545 }
6546 }
6547 else if (RTFLOAT80U_IS_INF(pr80Val))
6548 {
6549 fFsw |= X86_FSW_IE;
6550 if (!(fFcw & X86_FCW_IM))
6551 {
6552 fFsw |= X86_FSW_ES | X86_FSW_B;
6553 pFpuRes->r80Result = *pr80Val;
6554 }
6555 else
6556 {
6557 pFpuRes->r80Result = g_r80Indefinite;
6558 }
6559 }
6560 else if (RTFLOAT80U_IS_DENORMAL(pr80Val))
6561 {
6562 fFsw |= X86_FSW_DE;
6563
6564 if (fFcw & X86_FCW_DM)
6565 {
6566 if (fFcw & X86_FCW_UM)
6567 {
6568 pFpuRes->r80Result = *pr80Val;
6569 }
6570 else
6571 {
6572 /* Underflow signalling as described at 7.4 section of 1985 IEEE 754*/
6573 uint64_t uMantissa = pr80Val->s.uMantissa;
6574 uint32_t uExponent = ASMBitLastSetU64(uMantissa);
6575
6576 uExponent = 64 - uExponent;
6577 uMantissa <<= uExponent;
6578 uExponent = RTFLOAT128U_EXP_BIAS_ADJUST - uExponent + 1;
6579
6580 pFpuRes->r80Result.s.fSign = pr80Val->s.fSign;
6581 pFpuRes->r80Result.s.uMantissa = uMantissa;
6582 pFpuRes->r80Result.s.uExponent = uExponent;
6583 }
6584
6585 fFsw |= X86_FSW_UE | X86_FSW_PE;
6586
6587 if ((fFcw & X86_FCW_UM) && (fFcw & X86_FCW_PM))
6588 {
6589 /* All the exceptions are masked. */
6590 }
6591 else
6592 {
6593 fFsw |= X86_FSW_ES | X86_FSW_B;
6594 }
6595 }
6596 else
6597 {
6598 pFpuRes->r80Result = *pr80Val;
6599
6600 fFsw |= X86_FSW_ES | X86_FSW_B;
6601 }
6602 }
6603 else if (RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val))
6604 {
6605 pFpuRes->r80Result = *pr80Val;
6606 fFsw |= X86_FSW_DE;
6607
6608 if (fFcw & X86_FCW_DM)
6609 {
6610 if (fFcw & X86_FCW_PM)
6611 {
6612 fFsw |= X86_FSW_PE;
6613 }
6614 else
6615 {
6616 fFsw |= X86_FSW_ES | X86_FSW_B | X86_FSW_PE;
6617 }
6618
6619 pFpuRes->r80Result.sj64.uExponent = 1;
6620 }
6621 else
6622 {
6623 fFsw |= X86_FSW_ES | X86_FSW_B;
6624 }
6625 } else if ( RTFLOAT80U_IS_QUIET_NAN(pr80Val)
6626 || RTFLOAT80U_IS_INDEFINITE(pr80Val))
6627 {
6628 pFpuRes->r80Result = *pr80Val;
6629 } else {
6630 if ( ( RTFLOAT80U_IS_UNNORMAL(pr80Val)
6631 || RTFLOAT80U_IS_PSEUDO_NAN(pr80Val))
6632 && (fFcw & X86_FCW_IM))
6633 pFpuRes->r80Result = g_r80Indefinite;
6634 else
6635 {
6636 pFpuRes->r80Result = *pr80Val;
6637 if (RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val) && (fFcw & X86_FCW_IM))
6638 pFpuRes->r80Result.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
6639 }
6640
6641 fFsw |= X86_FSW_IE;
6642 if (!(fFcw & X86_FCW_IM))
6643 fFsw |= X86_FSW_ES | X86_FSW_B;
6644 }
6645
6646 pFpuRes->FSW = fFsw;
6647}
6648#endif /* IEM_WITHOUT_ASSEMBLY */
6649
6650IEM_DECL_IMPL_DEF(void, iemAImpl_fsin_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6651{
6652 iemAImpl_fsin_r80(pFpuState, pFpuRes, pr80Val);
6653}
6654
6655IEM_DECL_IMPL_DEF(void, iemAImpl_fsin_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6656{
6657 iemAImpl_fsin_r80(pFpuState, pFpuRes, pr80Val);
6658}
6659
6660#ifdef IEM_WITHOUT_ASSEMBLY
6661
6662static uint16_t iemAImpl_fcos_r80_normal(PCRTFLOAT80U pr80Val, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
6663{
6664 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
6665 extFloat80_t x = iemFpuSoftF80FromIprt(pr80Val);
6666 extFloat80_t v;
6667 (void)fFcw;
6668
6669 v = extF80_cos(x, &SoftState);
6670
6671 iemFpuSoftF80ToIprt(pr80Result, v);
6672
6673 return fFsw;
6674}
6675
6676IEM_DECL_IMPL_DEF(void, iemAImpl_fcos_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6677{
6678 uint16_t const fFcw = pFpuState->FCW;
6679 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | /*X86_FSW_C2 |*/ X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
6680
6681 if (RTFLOAT80U_IS_ZERO(pr80Val))
6682 {
6683 pFpuRes->r80Result = g_ar80One[0];
6684 }
6685 else if (RTFLOAT80U_IS_NORMAL(pr80Val))
6686 {
6687 if (pr80Val->s.uExponent >= RTFLOAT80U_EXP_BIAS + 63)
6688 {
6689 fFsw |= X86_FSW_C2;
6690 pFpuRes->r80Result = *pr80Val;
6691 }
6692 else
6693 {
6694 if (pr80Val->s.uExponent <= RTFLOAT80U_EXP_BIAS - 63)
6695 {
6696 pFpuRes->r80Result = g_ar80One[0];
6697
6698 }
6699 else
6700 {
6701 fFsw = iemAImpl_fcos_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
6702 fFsw |= X86_FSW_C1; // TBD: If the inexact result was rounded up (C1 is set) or “not rounded up” (C1 is cleared).
6703 }
6704 fFsw |= X86_FSW_PE;
6705 if (!(fFcw & X86_FCW_PM))
6706 fFsw |= X86_FSW_ES | X86_FSW_B;
6707 }
6708 }
6709 else if (RTFLOAT80U_IS_INF(pr80Val))
6710 {
6711 fFsw |= X86_FSW_IE;
6712 if (!(fFcw & X86_FCW_IM))
6713 {
6714 fFsw |= X86_FSW_ES | X86_FSW_B;
6715 pFpuRes->r80Result = *pr80Val;
6716 }
6717 else
6718 {
6719 pFpuRes->r80Result = g_r80Indefinite;
6720 }
6721 }
6722 else if (RTFLOAT80U_IS_DENORMAL(pr80Val) || RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val))
6723 {
6724 fFsw |= X86_FSW_DE;
6725
6726 if (fFcw & X86_FCW_DM)
6727 {
6728 pFpuRes->r80Result = g_ar80One[0];
6729
6730 if (fFcw & X86_FCW_PM)
6731 {
6732 fFsw |= X86_FSW_PE;
6733 }
6734 else
6735 {
6736 fFsw |= X86_FSW_PE | X86_FSW_ES | X86_FSW_B;
6737 }
6738 }
6739 else
6740 {
6741 pFpuRes->r80Result = *pr80Val;
6742 fFsw |= X86_FSW_ES | X86_FSW_B;
6743 }
6744 } else if ( RTFLOAT80U_IS_QUIET_NAN(pr80Val)
6745 || RTFLOAT80U_IS_INDEFINITE(pr80Val))
6746 {
6747 pFpuRes->r80Result = *pr80Val;
6748 } else {
6749 if ( ( RTFLOAT80U_IS_UNNORMAL(pr80Val)
6750 || RTFLOAT80U_IS_PSEUDO_NAN(pr80Val))
6751 && (fFcw & X86_FCW_IM))
6752 pFpuRes->r80Result = g_r80Indefinite;
6753 else
6754 {
6755 pFpuRes->r80Result = *pr80Val;
6756 if (RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val) && (fFcw & X86_FCW_IM))
6757 pFpuRes->r80Result.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
6758 }
6759
6760 fFsw |= X86_FSW_IE;
6761 if (!(fFcw & X86_FCW_IM))
6762 fFsw |= X86_FSW_ES | X86_FSW_B;
6763 }
6764
6765 pFpuRes->FSW = fFsw;
6766}
6767#endif /* IEM_WITHOUT_ASSEMBLY */
6768
6769IEM_DECL_IMPL_DEF(void, iemAImpl_fcos_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6770{
6771 iemAImpl_fcos_r80(pFpuState, pFpuRes, pr80Val);
6772}
6773
6774IEM_DECL_IMPL_DEF(void, iemAImpl_fcos_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6775{
6776 iemAImpl_fcos_r80(pFpuState, pFpuRes, pr80Val);
6777}
6778
6779#ifdef IEM_WITHOUT_ASSEMBLY
6780
6781static uint16_t iemAImpl_fsincos_r80_r80_normal(PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val, uint16_t fFcw, uint16_t fFsw)
6782{
6783 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
6784 extFloat80_t x = iemFpuSoftF80FromIprt(pr80Val);
6785 extFloat80_t r80Sin, r80Cos;
6786 (void)fFcw;
6787
6788 extF80_sincos(x, &r80Sin, &r80Cos, &SoftState);
6789
6790 iemFpuSoftF80ToIprt(&pFpuResTwo->r80Result1, r80Sin);
6791 iemFpuSoftF80ToIprt(&pFpuResTwo->r80Result2, r80Cos);
6792
6793 return fFsw;
6794}
6795
6796IEM_DECL_IMPL_DEF(void, iemAImpl_fsincos_r80_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
6797{
6798 uint16_t const fFcw = pFpuState->FCW;
6799 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | /*X86_FSW_C2 |*/ X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
6800
6801 if (RTFLOAT80U_IS_ZERO(pr80Val))
6802 {
6803 pFpuResTwo->r80Result1 = *pr80Val;
6804 pFpuResTwo->r80Result2 = g_ar80One[0];
6805 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6806 }
6807 else if (RTFLOAT80U_IS_NORMAL(pr80Val))
6808 {
6809 if (pr80Val->s.uExponent >= RTFLOAT80U_EXP_BIAS + 63)
6810 {
6811 fFsw |= X86_FSW_C2;
6812
6813 if (fFcw & X86_FCW_IM)
6814 {
6815 pFpuResTwo->r80Result1 = g_r80Indefinite;
6816 }
6817 else
6818 {
6819 pFpuResTwo->r80Result1 = g_ar80Zero[0];
6820 }
6821
6822 pFpuResTwo->r80Result2 = *pr80Val;
6823 }
6824 else
6825 {
6826 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6827
6828 if (pr80Val->s.uExponent <= RTFLOAT80U_EXP_BIAS - 63)
6829 {
6830 pFpuResTwo->r80Result1 = *pr80Val;
6831 pFpuResTwo->r80Result2 = g_ar80One[0];
6832 }
6833 else
6834 {
6835 fFsw = iemAImpl_fsincos_r80_r80_normal(pFpuResTwo, pr80Val, fFcw, fFsw);
6836 fFsw |= X86_FSW_C1; // TBD: If the inexact result was rounded up (C1 is set) or “not rounded up” (C1 is cleared).
6837 }
6838 fFsw |= X86_FSW_PE;
6839 if (!(fFcw & X86_FCW_PM))
6840 fFsw |= X86_FSW_ES | X86_FSW_B;
6841 }
6842 }
6843 else if (RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val))
6844 {
6845 fFsw |= X86_FSW_DE;
6846
6847 if (fFcw & X86_FCW_DM)
6848 {
6849 pFpuResTwo->r80Result1 = *pr80Val;
6850 pFpuResTwo->r80Result2 = g_ar80One[0];
6851 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6852
6853 if (fFcw & X86_FCW_PM)
6854 {
6855 fFsw |= X86_FSW_PE;
6856 }
6857 else
6858 {
6859 fFsw |= X86_FSW_PE | X86_FSW_ES | X86_FSW_B;
6860 }
6861
6862 pFpuResTwo->r80Result1.sj64.uExponent = 1;
6863 }
6864 else
6865 {
6866 pFpuResTwo->r80Result1 = g_ar80Zero[0];
6867 pFpuResTwo->r80Result2 = *pr80Val;
6868 fFsw |= X86_FSW_ES | X86_FSW_B;
6869 }
6870 }
6871 else if (RTFLOAT80U_IS_DENORMAL(pr80Val))
6872 {
6873 fFsw |= X86_FSW_DE;
6874
6875 if (fFcw & X86_FCW_DM)
6876 {
6877 pFpuResTwo->r80Result2 = g_ar80One[0];
6878
6879 if (fFcw & X86_FCW_UM)
6880 {
6881 pFpuResTwo->r80Result1 = *pr80Val;
6882 }
6883 else
6884 {
6885 /* Underflow signalling as described at 7.4 section of 1985 IEEE 754*/
6886 uint64_t uMantissa = pr80Val->s.uMantissa;
6887 uint32_t uExponent = ASMBitLastSetU64(uMantissa);
6888
6889 uExponent = 64 - uExponent;
6890 uMantissa <<= uExponent;
6891 uExponent = RTFLOAT128U_EXP_BIAS_ADJUST - uExponent + 1;
6892
6893 pFpuResTwo->r80Result1.s.fSign = pr80Val->s.fSign;
6894 pFpuResTwo->r80Result1.s.uMantissa = uMantissa;
6895 pFpuResTwo->r80Result1.s.uExponent = uExponent;
6896 }
6897
6898 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6899 fFsw |= X86_FSW_UE | X86_FSW_PE;
6900
6901 if ((fFcw & X86_FCW_UM) && (fFcw & X86_FCW_PM))
6902 {
6903 /* All the exceptions are masked. */
6904 }
6905 else
6906 {
6907 fFsw |= X86_FSW_ES | X86_FSW_B;
6908 }
6909 }
6910 else
6911 {
6912 pFpuResTwo->r80Result1 = g_ar80Zero[0];
6913 pFpuResTwo->r80Result2 = *pr80Val;
6914 fFsw |= X86_FSW_ES | X86_FSW_B;
6915 }
6916 }
6917 else if (RTFLOAT80U_IS_QUIET_NAN(pr80Val) || RTFLOAT80U_IS_INDEFINITE(pr80Val))
6918 {
6919 pFpuResTwo->r80Result1 = *pr80Val;
6920 pFpuResTwo->r80Result2 = *pr80Val;
6921 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6922 }
6923 else if (RTFLOAT80U_IS_UNNORMAL(pr80Val) || RTFLOAT80U_IS_PSEUDO_NAN(pr80Val))
6924 {
6925 if (fFcw & X86_FCW_IM)
6926 {
6927 pFpuResTwo->r80Result1 = g_r80Indefinite;
6928 pFpuResTwo->r80Result2 = g_r80Indefinite;
6929 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6930 }
6931 else
6932 {
6933 pFpuResTwo->r80Result1 = g_ar80Zero[0];
6934 pFpuResTwo->r80Result2 = *pr80Val;
6935 }
6936
6937 fFsw |= X86_FSW_IE;
6938 if (!(fFcw & X86_FCW_IM))
6939 fFsw |= X86_FSW_ES | X86_FSW_B;
6940 }
6941 else if (RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val))
6942 {
6943 pFpuResTwo->r80Result1 = *pr80Val;
6944 pFpuResTwo->r80Result2 = *pr80Val;
6945
6946 if (fFcw & X86_FCW_IM)
6947 {
6948 pFpuResTwo->r80Result1.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
6949 pFpuResTwo->r80Result2.s.uMantissa |= RT_BIT_64(62);
6950 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6951 }
6952 else
6953 {
6954 pFpuResTwo->r80Result1 = g_ar80Zero[0];
6955 pFpuResTwo->r80Result2 = *pr80Val;
6956 }
6957
6958 fFsw |= X86_FSW_IE;
6959 if (!(fFcw & X86_FCW_IM))
6960 fFsw |= X86_FSW_ES | X86_FSW_B;
6961 }
6962 else if (RTFLOAT80U_IS_INF(pr80Val))
6963 {
6964 if (fFcw & X86_FCW_IM)
6965 {
6966 pFpuResTwo->r80Result1 = g_r80Indefinite;
6967 pFpuResTwo->r80Result2 = g_r80Indefinite;
6968 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6969 }
6970 else
6971 {
6972 pFpuResTwo->r80Result1 = g_ar80Zero[0];
6973 pFpuResTwo->r80Result2 = *pr80Val;
6974 }
6975
6976 fFsw |= X86_FSW_IE;
6977 if (!(fFcw & X86_FCW_IM))
6978 fFsw |= X86_FSW_ES | X86_FSW_B;
6979 }
6980
6981 pFpuResTwo->FSW = fFsw;
6982}
6983#endif /* IEM_WITHOUT_ASSEMBLY */
6984
6985IEM_DECL_IMPL_DEF(void, iemAImpl_fsincos_r80_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
6986{
6987 iemAImpl_fsincos_r80_r80(pFpuState, pFpuResTwo, pr80Val);
6988}
6989
6990IEM_DECL_IMPL_DEF(void, iemAImpl_fsincos_r80_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
6991{
6992 iemAImpl_fsincos_r80_r80(pFpuState, pFpuResTwo, pr80Val);
6993}
6994
6995#ifdef IEM_WITHOUT_ASSEMBLY
6996
6997
6998/*********************************************************************************************************************************
6999* x87 FPU Compare and Testing Operations *
7000*********************************************************************************************************************************/
7001
7002IEM_DECL_IMPL_DEF(void, iemAImpl_ftst_r80,(PCX86FXSTATE pFpuState, uint16_t *pu16Fsw, PCRTFLOAT80U pr80Val))
7003{
7004 uint16_t fFsw = (7 << X86_FSW_TOP_SHIFT);
7005
7006 if (RTFLOAT80U_IS_ZERO(pr80Val))
7007 fFsw |= X86_FSW_C3;
7008 else if (RTFLOAT80U_IS_NORMAL(pr80Val) || RTFLOAT80U_IS_INF(pr80Val))
7009 fFsw |= pr80Val->s.fSign ? X86_FSW_C0 : 0;
7010 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val))
7011 {
7012 fFsw |= pr80Val->s.fSign ? X86_FSW_C0 | X86_FSW_DE : X86_FSW_DE;
7013 if (!(pFpuState->FCW & X86_FCW_DM))
7014 fFsw |= X86_FSW_ES | X86_FSW_B;
7015 }
7016 else
7017 {
7018 fFsw |= X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3 | X86_FSW_IE;
7019 if (!(pFpuState->FCW & X86_FCW_IM))
7020 fFsw |= X86_FSW_ES | X86_FSW_B;
7021 }
7022
7023 *pu16Fsw = fFsw;
7024}
7025
7026
7027IEM_DECL_IMPL_DEF(void, iemAImpl_fxam_r80,(PCX86FXSTATE pFpuState, uint16_t *pu16Fsw, PCRTFLOAT80U pr80Val))
7028{
7029 RT_NOREF(pFpuState);
7030 uint16_t fFsw = (7 << X86_FSW_TOP_SHIFT);
7031
7032 /* C1 = sign bit (always, even if empty Intel says). */
7033 if (pr80Val->s.fSign)
7034 fFsw |= X86_FSW_C1;
7035
7036 /* Classify the value in C0, C2, C3. */
7037 if (!(pFpuState->FTW & RT_BIT_32(X86_FSW_TOP_GET(pFpuState->FSW))))
7038 fFsw |= X86_FSW_C0 | X86_FSW_C3; /* empty */
7039 else if (RTFLOAT80U_IS_NORMAL(pr80Val))
7040 fFsw |= X86_FSW_C2;
7041 else if (RTFLOAT80U_IS_ZERO(pr80Val))
7042 fFsw |= X86_FSW_C3;
7043 else if (RTFLOAT80U_IS_QUIET_OR_SIGNALLING_NAN(pr80Val))
7044 fFsw |= X86_FSW_C0;
7045 else if (RTFLOAT80U_IS_INF(pr80Val))
7046 fFsw |= X86_FSW_C0 | X86_FSW_C2;
7047 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val))
7048 fFsw |= X86_FSW_C2 | X86_FSW_C3;
7049 /* whatever else: 0 */
7050
7051 *pu16Fsw = fFsw;
7052}
7053
7054
7055/**
7056 * Worker for fcom, fucom, and friends.
7057 */
7058static uint16_t iemAImpl_fcom_r80_by_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2,
7059 uint16_t fFcw, uint16_t fFsw, bool fIeOnAllNaNs)
7060{
7061 /*
7062 * Unpack the values.
7063 */
7064 bool const fSign1 = pr80Val1->s.fSign;
7065 int32_t iExponent1 = pr80Val1->s.uExponent;
7066 uint64_t uMantissa1 = pr80Val1->s.uMantissa;
7067
7068 bool const fSign2 = pr80Val2->s.fSign;
7069 int32_t iExponent2 = pr80Val2->s.uExponent;
7070 uint64_t uMantissa2 = pr80Val2->s.uMantissa;
7071
7072 /*
7073 * Check for invalid inputs.
7074 */
7075 if ( RTFLOAT80U_IS_387_INVALID_EX(uMantissa1, iExponent1)
7076 || RTFLOAT80U_IS_387_INVALID_EX(uMantissa2, iExponent2))
7077 {
7078 if (!(fFcw & X86_FCW_IM))
7079 fFsw |= X86_FSW_ES | X86_FSW_B;
7080 return fFsw | X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3 | X86_FSW_IE;
7081 }
7082
7083 /*
7084 * Check for NaNs and indefinites, they are all unordered and trumps #DE.
7085 */
7086 if ( RTFLOAT80U_IS_INDEFINITE_OR_QUIET_OR_SIGNALLING_NAN_EX(uMantissa1, iExponent1)
7087 || RTFLOAT80U_IS_INDEFINITE_OR_QUIET_OR_SIGNALLING_NAN_EX(uMantissa2, iExponent2))
7088 {
7089 if ( fIeOnAllNaNs
7090 || RTFLOAT80U_IS_SIGNALLING_NAN_EX(uMantissa1, iExponent1)
7091 || RTFLOAT80U_IS_SIGNALLING_NAN_EX(uMantissa2, iExponent2))
7092 {
7093 fFsw |= X86_FSW_IE;
7094 if (!(fFcw & X86_FCW_IM))
7095 fFsw |= X86_FSW_ES | X86_FSW_B;
7096 }
7097 return fFsw | X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3;
7098 }
7099
7100 /*
7101 * Normalize the values.
7102 */
7103 if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL_EX(uMantissa1, iExponent1))
7104 {
7105 if (RTFLOAT80U_IS_PSEUDO_DENORMAL_EX(uMantissa1, iExponent1))
7106 iExponent1 = 1;
7107 else
7108 {
7109 iExponent1 = 64 - ASMBitLastSetU64(uMantissa1);
7110 uMantissa1 <<= iExponent1;
7111 iExponent1 = 1 - iExponent1;
7112 }
7113 fFsw |= X86_FSW_DE;
7114 if (!(fFcw & X86_FCW_DM))
7115 fFsw |= X86_FSW_ES | X86_FSW_B;
7116 }
7117
7118 if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL_EX(uMantissa2, iExponent2))
7119 {
7120 if (RTFLOAT80U_IS_PSEUDO_DENORMAL_EX(uMantissa2, iExponent2))
7121 iExponent2 = 1;
7122 else
7123 {
7124 iExponent2 = 64 - ASMBitLastSetU64(uMantissa2);
7125 uMantissa2 <<= iExponent2;
7126 iExponent2 = 1 - iExponent2;
7127 }
7128 fFsw |= X86_FSW_DE;
7129 if (!(fFcw & X86_FCW_DM))
7130 fFsw |= X86_FSW_ES | X86_FSW_B;
7131 }
7132
7133 /*
7134 * Test if equal (val1 == val2):
7135 */
7136 if ( uMantissa1 == uMantissa2
7137 && iExponent1 == iExponent2
7138 && ( fSign1 == fSign2
7139 || (uMantissa1 == 0 && iExponent1 == 0) /* ignore sign for zero */ ) )
7140 fFsw |= X86_FSW_C3;
7141 /*
7142 * Test if less than (val1 < val2):
7143 */
7144 else if (fSign1 && !fSign2)
7145 fFsw |= X86_FSW_C0;
7146 else if (fSign1 == fSign2)
7147 {
7148 /* Zeros are problematic, however at the most one can be zero here. */
7149 if (RTFLOAT80U_IS_ZERO_EX(uMantissa1, iExponent1))
7150 return !fSign1 ? fFsw | X86_FSW_C0 : fFsw;
7151 if (RTFLOAT80U_IS_ZERO_EX(uMantissa2, iExponent2))
7152 return fSign1 ? fFsw | X86_FSW_C0 : fFsw;
7153
7154 if ( fSign1
7155 ^ ( iExponent1 < iExponent2
7156 || ( iExponent1 == iExponent2
7157 && uMantissa1 < uMantissa2 ) ) )
7158 fFsw |= X86_FSW_C0;
7159 }
7160 /* else: No flags set if greater. */
7161
7162 return fFsw;
7163}
7164
7165
7166IEM_DECL_IMPL_DEF(void, iemAImpl_fcom_r80_by_r80,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
7167 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
7168{
7169 *pfFsw = iemAImpl_fcom_r80_by_r80_worker(pr80Val1, pr80Val2, pFpuState->FCW, 6 << X86_FSW_TOP_SHIFT, true /*fIeOnAllNaNs*/);
7170}
7171
7172
7173
7174
7175IEM_DECL_IMPL_DEF(void, iemAImpl_fucom_r80_by_r80,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
7176 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
7177{
7178 *pfFsw = iemAImpl_fcom_r80_by_r80_worker(pr80Val1, pr80Val2, pFpuState->FCW, 6 << X86_FSW_TOP_SHIFT, false /*fIeOnAllNaNs*/);
7179}
7180
7181
7182IEM_DECL_IMPL_DEF(void, iemAImpl_fcom_r80_by_r64,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
7183 PCRTFLOAT80U pr80Val1, PCRTFLOAT64U pr64Val2))
7184{
7185 RTFLOAT80U r80Val2;
7186 uint16_t fFsw = iemAImplConvertR64ToR80(pr64Val2, &r80Val2);
7187 Assert(!fFsw || fFsw == X86_FSW_DE);
7188 *pfFsw = iemAImpl_fcom_r80_by_r80_worker(pr80Val1, &r80Val2, pFpuState->FCW, 7 << X86_FSW_TOP_SHIFT, true /*fIeOnAllNaNs*/);
7189 if (fFsw != 0 && !(*pfFsw & X86_FSW_IE))
7190 {
7191 if (!(pFpuState->FCW & X86_FCW_DM))
7192 fFsw |= X86_FSW_ES | X86_FSW_B;
7193 *pfFsw |= fFsw;
7194 }
7195}
7196
7197
7198IEM_DECL_IMPL_DEF(void, iemAImpl_fcom_r80_by_r32,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
7199 PCRTFLOAT80U pr80Val1, PCRTFLOAT32U pr32Val2))
7200{
7201 RTFLOAT80U r80Val2;
7202 uint16_t fFsw = iemAImplConvertR32ToR80(pr32Val2, &r80Val2);
7203 Assert(!fFsw || fFsw == X86_FSW_DE);
7204 *pfFsw = iemAImpl_fcom_r80_by_r80_worker(pr80Val1, &r80Val2, pFpuState->FCW, 7 << X86_FSW_TOP_SHIFT, true /*fIeOnAllNaNs*/);
7205 if (fFsw != 0 && !(*pfFsw & X86_FSW_IE))
7206 {
7207 if (!(pFpuState->FCW & X86_FCW_DM))
7208 fFsw |= X86_FSW_ES | X86_FSW_B;
7209 *pfFsw |= fFsw;
7210 }
7211}
7212
7213
7214IEM_DECL_IMPL_DEF(void, iemAImpl_ficom_r80_by_i32,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
7215 PCRTFLOAT80U pr80Val1, int32_t const *pi32Val2))
7216{
7217 RTFLOAT80U r80Val2;
7218 iemAImpl_fcom_r80_by_r80(pFpuState, pfFsw, pr80Val1, iemAImplConvertI32ToR80(*pi32Val2, &r80Val2));
7219 *pfFsw = (*pfFsw & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT);
7220}
7221
7222
7223IEM_DECL_IMPL_DEF(void, iemAImpl_ficom_r80_by_i16,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
7224 PCRTFLOAT80U pr80Val1, int16_t const *pi16Val2))
7225{
7226 RTFLOAT80U r80Val2;
7227 iemAImpl_fcom_r80_by_r80(pFpuState, pfFsw, pr80Val1, iemAImplConvertI16ToR80(*pi16Val2, &r80Val2));
7228 *pfFsw = (*pfFsw & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT);
7229}
7230
7231
7232/**
7233 * Worker for fcomi & fucomi.
7234 */
7235static uint32_t iemAImpl_fcomi_r80_by_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2,
7236 uint16_t fFcw, uint16_t fFswIn, bool fIeOnAllNaNs, uint16_t *pfFsw)
7237{
7238 uint16_t fFsw = iemAImpl_fcom_r80_by_r80_worker(pr80Val1, pr80Val2, fFcw, 6 << X86_FSW_TOP_SHIFT, fIeOnAllNaNs);
7239 uint32_t fEflags = ((fFsw & X86_FSW_C3) >> (X86_FSW_C3_BIT - X86_EFL_ZF_BIT))
7240 | ((fFsw & X86_FSW_C2) >> (X86_FSW_C2_BIT - X86_EFL_PF_BIT))
7241 | ((fFsw & X86_FSW_C0) >> (X86_FSW_C0_BIT - X86_EFL_CF_BIT));
7242
7243 /* Note! C1 is not cleared as per docs! Everything is preserved. */
7244 *pfFsw = (fFsw & ~X86_FSW_C_MASK) | (fFswIn & X86_FSW_C_MASK);
7245 return fEflags | X86_EFL_IF | X86_EFL_RA1_MASK;
7246}
7247
7248
7249IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_fcomi_r80_by_r80,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
7250 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
7251{
7252 return iemAImpl_fcomi_r80_by_r80_worker(pr80Val1, pr80Val2, pFpuState->FCW, pFpuState->FSW, true /*fIeOnAllNaNs*/, pfFsw);
7253}
7254
7255
7256IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_fucomi_r80_by_r80,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
7257 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
7258{
7259 return iemAImpl_fcomi_r80_by_r80_worker(pr80Val1, pr80Val2, pFpuState->FCW, pFpuState->FSW, false /*fIeOnAllNaNs*/, pfFsw);
7260}
7261
7262
7263/*********************************************************************************************************************************
7264* x87 FPU Other Operations *
7265*********************************************************************************************************************************/
7266
7267/**
7268 * Helper for iemAImpl_frndint_r80, called both on normal and denormal numbers.
7269 */
7270static uint16_t iemAImpl_frndint_r80_normal(PCRTFLOAT80U pr80Val, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
7271{
7272 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
7273 iemFpuSoftF80ToIprt(pr80Result, extF80_roundToInt(iemFpuSoftF80FromIprt(pr80Val), SoftState.roundingMode,
7274 true /*exact / generate #PE */, &SoftState));
7275 return IEM_SOFTFLOAT_STATE_TO_FSW(fFsw, &SoftState, fFcw);
7276}
7277
7278
7279IEM_DECL_IMPL_DEF(void, iemAImpl_frndint_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
7280{
7281 uint16_t const fFcw = pFpuState->FCW;
7282 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
7283
7284 if (RTFLOAT80U_IS_NORMAL(pr80Val))
7285 fFsw = iemAImpl_frndint_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
7286 else if ( RTFLOAT80U_IS_ZERO(pr80Val)
7287 || RTFLOAT80U_IS_QUIET_NAN(pr80Val)
7288 || RTFLOAT80U_IS_INDEFINITE(pr80Val)
7289 || RTFLOAT80U_IS_INF(pr80Val))
7290 pFpuRes->r80Result = *pr80Val;
7291 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val))
7292 {
7293 fFsw |= X86_FSW_DE;
7294 if (fFcw & X86_FCW_DM)
7295 fFsw = iemAImpl_frndint_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
7296 else
7297 {
7298 pFpuRes->r80Result = *pr80Val;
7299 fFsw |= X86_FSW_ES | X86_FSW_B;
7300 }
7301 }
7302 else
7303 {
7304 if (fFcw & X86_FCW_IM)
7305 {
7306 if (!RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val))
7307 pFpuRes->r80Result = g_r80Indefinite;
7308 else
7309 {
7310 pFpuRes->r80Result = *pr80Val;
7311 pFpuRes->r80Result.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
7312 }
7313 }
7314 else
7315 {
7316 pFpuRes->r80Result = *pr80Val;
7317 fFsw |= X86_FSW_ES | X86_FSW_B;
7318 }
7319 fFsw |= X86_FSW_IE;
7320 }
7321 pFpuRes->FSW = fFsw;
7322}
7323
7324
7325IEM_DECL_IMPL_DEF(void, iemAImpl_fscale_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
7326 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
7327{
7328 /* The SoftFloat worker function extF80_scale_extF80 is of our creation, so
7329 it does everything we need it to do. */
7330 uint16_t const fFcw = pFpuState->FCW;
7331 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
7332 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
7333 extFloat80_t r80XResult = extF80_scale_extF80(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2), &SoftState);
7334 pFpuRes->FSW = iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
7335}
7336
7337
7338/**
7339 * Helper for iemAImpl_fsqrt_r80, called both on normal and denormal numbers.
7340 */
7341static uint16_t iemAImpl_fsqrt_r80_normal(PCRTFLOAT80U pr80Val, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
7342{
7343 Assert(!pr80Val->s.fSign);
7344 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
7345 iemFpuSoftF80ToIprt(pr80Result, extF80_sqrt(iemFpuSoftF80FromIprt(pr80Val), &SoftState));
7346 return IEM_SOFTFLOAT_STATE_TO_FSW(fFsw, &SoftState, fFcw);
7347}
7348
7349
7350IEM_DECL_IMPL_DEF(void, iemAImpl_fsqrt_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
7351{
7352 uint16_t const fFcw = pFpuState->FCW;
7353 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
7354
7355 if (RTFLOAT80U_IS_NORMAL(pr80Val) && !pr80Val->s.fSign)
7356 fFsw = iemAImpl_fsqrt_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
7357 else if ( RTFLOAT80U_IS_ZERO(pr80Val)
7358 || RTFLOAT80U_IS_QUIET_NAN(pr80Val)
7359 || RTFLOAT80U_IS_INDEFINITE(pr80Val)
7360 || (RTFLOAT80U_IS_INF(pr80Val) && !pr80Val->s.fSign))
7361 pFpuRes->r80Result = *pr80Val;
7362 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val) && !pr80Val->s.fSign) /* Negative denormals only generate #IE! */
7363 {
7364 fFsw |= X86_FSW_DE;
7365 if (fFcw & X86_FCW_DM)
7366 fFsw = iemAImpl_fsqrt_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
7367 else
7368 {
7369 pFpuRes->r80Result = *pr80Val;
7370 fFsw |= X86_FSW_ES | X86_FSW_B;
7371 }
7372 }
7373 else
7374 {
7375 if (fFcw & X86_FCW_IM)
7376 {
7377 if (!RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val))
7378 pFpuRes->r80Result = g_r80Indefinite;
7379 else
7380 {
7381 pFpuRes->r80Result = *pr80Val;
7382 pFpuRes->r80Result.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
7383 }
7384 }
7385 else
7386 {
7387 pFpuRes->r80Result = *pr80Val;
7388 fFsw |= X86_FSW_ES | X86_FSW_B;
7389 }
7390 fFsw |= X86_FSW_IE;
7391 }
7392 pFpuRes->FSW = fFsw;
7393}
7394
7395
7396/**
7397 * @code{.unparsed}
7398 * x x * ln2
7399 * f(x) = 2 - 1 = e - 1
7400 *
7401 * @endcode
7402 *
7403 * We can approximate e^x by a Taylor/Maclaurin series (see
7404 * https://en.wikipedia.org/wiki/Taylor_series#Exponential_function):
7405 * @code{.unparsed}
7406 * n 0 1 2 3 4
7407 * inf x x x x x x
7408 * SUM ----- = --- + --- + --- + --- + --- + ...
7409 * n=0 n! 0! 1! 2! 3! 4!
7410 *
7411 * 2 3 4
7412 * x x x
7413 * = 1 + x + --- + --- + --- + ...
7414 * 2! 3! 4!
7415 * @endcode
7416 *
7417 * Given z = x * ln2, we get:
7418 * @code{.unparsed}
7419 * 2 3 4 n
7420 * z z z z z
7421 * e - 1 = z + --- + --- + --- + ... + ---
7422 * 2! 3! 4! n!
7423 * @endcode
7424 *
7425 * Wanting to use Horner's method, we move one z outside and get:
7426 * @code{.unparsed}
7427 * 2 3 (n-1)
7428 * z z z z
7429 * = z ( 1 + --- + --- + --- + ... + ------- )
7430 * 2! 3! 4! n!
7431 * @endcode
7432 *
7433 * The constants we need for using Horner's methods are 1 and 1 / n!.
7434 *
7435 * For very tiny x values, we can get away with f(x) = x * ln 2, because
7436 * because we don't have the necessary precision to represent 1.0 + z/3 + ...
7437 * and can approximate it to be 1.0. For a visual demonstration of this
7438 * check out https://www.desmos.com/calculator/vidcdxizd9 (for as long
7439 * as it valid), plotting f(x) = 2^x - 1 and f(x) = x * ln2.
7440 *
7441 *
7442 * As constant accuracy goes, figure 0.1 "80387 Block Diagram" in the "80387
7443 * Data Sheet" (order 231920-002; Appendix E in 80387 PRM 231917-001; Military
7444 * i387SX 271166-002), indicates that constants are 67-bit (constant rom block)
7445 * and the internal mantissa size is 68-bit (mantissa adder & barrel shifter
7446 * blocks). (The one bit difference is probably an implicit one missing from
7447 * the constant ROM.) A paper on division and sqrt on the AMD-K7 by Stuart F.
7448 * Oberman states that it internally used a 68 bit mantissa with a 18-bit
7449 * exponent.
7450 *
7451 * However, even when sticking to 67 constants / 68 mantissas, I have not yet
7452 * successfully reproduced the exact results from an Intel 10980XE, there is
7453 * always a portition of rounding differences. Not going to spend too much time
7454 * on getting this 100% the same, at least not now.
7455 *
7456 * P.S. If someone are really curious about 8087 and its contstants:
7457 * http://www.righto.com/2020/05/extracting-rom-constants-from-8087-math.html
7458 *
7459 *
7460 * @param pr80Val The exponent value (x), less than 1.0, greater than
7461 * -1.0 and not zero. This can be a normal, denormal
7462 * or pseudo-denormal value.
7463 * @param pr80Result Where to return the result.
7464 * @param fFcw FPU control word.
7465 * @param fFsw FPU status word.
7466 */
7467static uint16_t iemAImpl_f2xm1_r80_normal(PCRTFLOAT80U pr80Val, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
7468{
7469 /* As mentioned above, we can skip the expensive polynomial calculation
7470 as it will be close enough to 1.0 that it makes no difference.
7471
7472 The cutoff point for intel 10980XE is exponents >= -69. Intel
7473 also seems to be using a 67-bit or 68-bit constant value, and we get
7474 a smattering of rounding differences if we go for higher precision. */
7475 if (pr80Val->s.uExponent <= RTFLOAT80U_EXP_BIAS - 69)
7476 {
7477 RTUINT256U u256;
7478 RTUInt128MulByU64Ex(&u256, &g_u128Ln2MantissaIntel, pr80Val->s.uMantissa);
7479 u256.QWords.qw0 |= 1; /* force #PE */
7480 fFsw = iemFpuFloat80RoundAndComposeFrom192(pr80Result, pr80Val->s.fSign, &u256,
7481 !RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val) && !RTFLOAT80U_IS_DENORMAL(pr80Val)
7482 ? (int32_t)pr80Val->s.uExponent - RTFLOAT80U_EXP_BIAS
7483 : 1 - RTFLOAT80U_EXP_BIAS,
7484 fFcw, fFsw);
7485 }
7486 else
7487 {
7488#ifdef IEM_WITH_FLOAT128_FOR_FPU
7489 /* This approach is not good enough for small values, we end up with zero. */
7490 int const fOldRounding = iemFpuF128SetRounding(fFcw);
7491 _Float128 rd128Val = iemFpuF128FromFloat80(pr80Val, fFcw);
7492 _Float128 rd128Result = powf128(2.0L, rd128Val);
7493 rd128Result -= 1.0L;
7494 fFsw = iemFpuF128ToFloat80(pr80Result, rd128Result, fFcw, fFsw);
7495 iemFpuF128RestoreRounding(fOldRounding);
7496
7497# else
7498 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
7499 float128_t const x = iemFpuSoftF128FromFloat80(pr80Val);
7500
7501 /* As mentioned above, enforce 68-bit internal mantissa width to better
7502 match the Intel 10980XE results. */
7503 unsigned const cPrecision = 68;
7504
7505 /* first calculate z = x * ln2 */
7506 float128_t z = iemFpuSoftF128Precision(f128_mul(x, iemFpuSoftF128PrecisionIprt(&g_r128Ln2, cPrecision), &SoftState),
7507 cPrecision);
7508
7509 /* Then do the polynomial evaluation. */
7510 float128_t r = iemFpuSoftF128HornerPoly(z, g_ar128F2xm1HornerConsts, RT_ELEMENTS(g_ar128F2xm1HornerConsts),
7511 cPrecision, &SoftState);
7512 r = f128_mul(z, r, &SoftState);
7513
7514 /* Output the result. */
7515 fFsw = iemFpuSoftF128ToFloat80(pr80Result, r, fFcw, fFsw);
7516# endif
7517 }
7518 return fFsw;
7519}
7520
7521
7522IEM_DECL_IMPL_DEF(void, iemAImpl_f2xm1_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
7523{
7524 uint16_t const fFcw = pFpuState->FCW;
7525 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
7526
7527 if (RTFLOAT80U_IS_NORMAL(pr80Val))
7528 {
7529 if (pr80Val->s.uExponent < RTFLOAT80U_EXP_BIAS)
7530 fFsw = iemAImpl_f2xm1_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
7531 else
7532 {
7533 /* Special case:
7534 2^+1.0 - 1.0 = 1.0
7535 2^-1.0 - 1.0 = -0.5 */
7536 if ( pr80Val->s.uExponent == RTFLOAT80U_EXP_BIAS
7537 && pr80Val->s.uMantissa == RT_BIT_64(63))
7538 {
7539 pFpuRes->r80Result.s.uMantissa = RT_BIT_64(63);
7540 pFpuRes->r80Result.s.uExponent = RTFLOAT80U_EXP_BIAS - pr80Val->s.fSign;
7541 pFpuRes->r80Result.s.fSign = pr80Val->s.fSign;
7542 }
7543 /* ST(0) > 1.0 || ST(0) < -1.0: undefined behavior */
7544 /** @todo 287 is documented to only accept values 0 <= ST(0) <= 0.5. */
7545 else
7546 pFpuRes->r80Result = *pr80Val;
7547 fFsw |= X86_FSW_PE;
7548 if (!(fFcw & X86_FCW_PM))
7549 fFsw |= X86_FSW_ES | X86_FSW_B;
7550 }
7551 }
7552 else if ( RTFLOAT80U_IS_ZERO(pr80Val)
7553 || RTFLOAT80U_IS_QUIET_NAN(pr80Val)
7554 || RTFLOAT80U_IS_INDEFINITE(pr80Val))
7555 pFpuRes->r80Result = *pr80Val;
7556 else if (RTFLOAT80U_IS_INF(pr80Val))
7557 pFpuRes->r80Result = pr80Val->s.fSign ? g_ar80One[1] : *pr80Val;
7558 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val))
7559 {
7560 fFsw |= X86_FSW_DE;
7561 if (fFcw & X86_FCW_DM)
7562 fFsw = iemAImpl_f2xm1_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
7563 else
7564 {
7565 pFpuRes->r80Result = *pr80Val;
7566 fFsw |= X86_FSW_ES | X86_FSW_B;
7567 }
7568 }
7569 else
7570 {
7571 if ( ( RTFLOAT80U_IS_UNNORMAL(pr80Val)
7572 || RTFLOAT80U_IS_PSEUDO_NAN(pr80Val))
7573 && (fFcw & X86_FCW_IM))
7574 pFpuRes->r80Result = g_r80Indefinite;
7575 else
7576 {
7577 pFpuRes->r80Result = *pr80Val;
7578 if (RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val) && (fFcw & X86_FCW_IM))
7579 pFpuRes->r80Result.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
7580 }
7581 fFsw |= X86_FSW_IE;
7582 if (!(fFcw & X86_FCW_IM))
7583 fFsw |= X86_FSW_ES | X86_FSW_B;
7584 }
7585 pFpuRes->FSW = fFsw;
7586}
7587
7588#endif /* IEM_WITHOUT_ASSEMBLY */
7589
7590IEM_DECL_IMPL_DEF(void, iemAImpl_f2xm1_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
7591{
7592 iemAImpl_f2xm1_r80(pFpuState, pFpuRes, pr80Val);
7593}
7594
7595IEM_DECL_IMPL_DEF(void, iemAImpl_f2xm1_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
7596{
7597 iemAImpl_f2xm1_r80(pFpuState, pFpuRes, pr80Val);
7598}
7599
7600#ifdef IEM_WITHOUT_ASSEMBLY
7601
7602IEM_DECL_IMPL_DEF(void, iemAImpl_fabs_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
7603{
7604 pFpuRes->FSW = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
7605 pFpuRes->r80Result = *pr80Val;
7606 pFpuRes->r80Result.s.fSign = 0;
7607}
7608
7609
7610IEM_DECL_IMPL_DEF(void, iemAImpl_fchs_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
7611{
7612 pFpuRes->FSW = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
7613 pFpuRes->r80Result = *pr80Val;
7614 pFpuRes->r80Result.s.fSign = !pr80Val->s.fSign;
7615}
7616
7617
7618IEM_DECL_IMPL_DEF(void, iemAImpl_fxtract_r80_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
7619{
7620 uint16_t const fFcw = pFpuState->FCW;
7621 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
7622
7623 if (RTFLOAT80U_IS_NORMAL(pr80Val))
7624 {
7625 softfloat_state_t Ignored = SOFTFLOAT_STATE_INIT_DEFAULTS();
7626 iemFpuSoftF80ToIprt(&pFpuResTwo->r80Result1, i32_to_extF80((int32_t)pr80Val->s.uExponent - RTFLOAT80U_EXP_BIAS, &Ignored));
7627
7628 pFpuResTwo->r80Result2.s.fSign = pr80Val->s.fSign;
7629 pFpuResTwo->r80Result2.s.uExponent = RTFLOAT80U_EXP_BIAS;
7630 pFpuResTwo->r80Result2.s.uMantissa = pr80Val->s.uMantissa;
7631 }
7632 else if (RTFLOAT80U_IS_ZERO(pr80Val))
7633 {
7634 fFsw |= X86_FSW_ZE;
7635 if (fFcw & X86_FCW_ZM)
7636 {
7637 pFpuResTwo->r80Result1 = g_ar80Infinity[1];
7638 pFpuResTwo->r80Result2 = *pr80Val;
7639 }
7640 else
7641 {
7642 pFpuResTwo->r80Result2 = *pr80Val;
7643 fFsw = X86_FSW_ES | X86_FSW_B | (fFsw & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT);
7644 }
7645 }
7646 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val))
7647 {
7648 fFsw |= X86_FSW_DE;
7649 if (fFcw & X86_FCW_DM)
7650 {
7651 pFpuResTwo->r80Result2.s.fSign = pr80Val->s.fSign;
7652 pFpuResTwo->r80Result2.s.uExponent = RTFLOAT80U_EXP_BIAS;
7653 pFpuResTwo->r80Result2.s.uMantissa = pr80Val->s.uMantissa;
7654 int32_t iExponent = -16382;
7655 while (!(pFpuResTwo->r80Result2.s.uMantissa & RT_BIT_64(63)))
7656 {
7657 pFpuResTwo->r80Result2.s.uMantissa <<= 1;
7658 iExponent--;
7659 }
7660
7661 softfloat_state_t Ignored = SOFTFLOAT_STATE_INIT_DEFAULTS();
7662 iemFpuSoftF80ToIprt(&pFpuResTwo->r80Result1, i32_to_extF80(iExponent, &Ignored));
7663 }
7664 else
7665 {
7666 pFpuResTwo->r80Result2 = *pr80Val;
7667 fFsw = X86_FSW_ES | X86_FSW_B | (fFsw & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT);
7668 }
7669 }
7670 else if ( RTFLOAT80U_IS_QUIET_NAN(pr80Val)
7671 || RTFLOAT80U_IS_INDEFINITE(pr80Val))
7672 {
7673 pFpuResTwo->r80Result1 = *pr80Val;
7674 pFpuResTwo->r80Result2 = *pr80Val;
7675 }
7676 else if (RTFLOAT80U_IS_INF(pr80Val))
7677 {
7678 pFpuResTwo->r80Result1 = g_ar80Infinity[0];
7679 pFpuResTwo->r80Result2 = *pr80Val;
7680 }
7681 else
7682 {
7683 if (fFcw & X86_FCW_IM)
7684 {
7685 if (!RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val))
7686 pFpuResTwo->r80Result1 = g_r80Indefinite;
7687 else
7688 {
7689 pFpuResTwo->r80Result1 = *pr80Val;
7690 pFpuResTwo->r80Result1.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
7691 }
7692 pFpuResTwo->r80Result2 = pFpuResTwo->r80Result1;
7693 }
7694 else
7695 {
7696 pFpuResTwo->r80Result2 = *pr80Val;
7697 fFsw = X86_FSW_ES | X86_FSW_B | (fFsw & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT);
7698 }
7699 fFsw |= X86_FSW_IE;
7700 }
7701 pFpuResTwo->FSW = fFsw;
7702}
7703#endif /* IEM_WITHOUT_ASSEMBLY */
7704
7705#if defined(IEM_WITHOUT_ASSEMBLY)
7706
7707static uint16_t iemAImpl_fyl2x_r80_by_r80_normal(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
7708{
7709 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
7710 extFloat80_t y = iemFpuSoftF80FromIprt(pr80Val1);
7711 extFloat80_t x = iemFpuSoftF80FromIprt(pr80Val2);
7712 extFloat80_t v;
7713 (void)fFcw;
7714
7715 v = extF80_ylog2x(y, x, &SoftState);
7716 iemFpuSoftF80ToIprt(pr80Result, v);
7717
7718 return fFsw;
7719}
7720
7721IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2x_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
7722 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
7723{
7724 uint16_t const fFcw = pFpuState->FCW;
7725 uint16_t fFsw = pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3);
7726
7727 if (RTFLOAT80U_IS_NORMAL(pr80Val1) && RTFLOAT80U_IS_NORMAL(pr80Val2) && !pr80Val2->s.fSign)
7728 {
7729 fFsw |= iemAImpl_fyl2x_r80_by_r80_normal(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw);
7730
7731 fFsw |= X86_FSW_PE | (7 << X86_FSW_TOP_SHIFT);
7732 if (!(fFcw & X86_FCW_PM))
7733 fFsw |= X86_FSW_ES | X86_FSW_B;
7734 }
7735 else
7736 {
7737 fFsw |= X86_FSW_IE;
7738
7739 if (!(fFcw & X86_FCW_IM))
7740 {
7741 pFpuRes->r80Result = *pr80Val2;
7742 fFsw |= X86_FSW_ES | X86_FSW_B | (6 << X86_FSW_TOP_SHIFT);
7743 }
7744 else
7745 {
7746 pFpuRes->r80Result = g_r80Indefinite;
7747 fFsw |= (7 << X86_FSW_TOP_SHIFT);
7748 }
7749 }
7750
7751 pFpuRes->FSW = fFsw;
7752}
7753#endif /* IEM_WITHOUT_ASSEMBLY */
7754
7755IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2x_r80_by_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
7756 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
7757{
7758 iemAImpl_fyl2x_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
7759}
7760
7761IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2x_r80_by_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
7762 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
7763{
7764 iemAImpl_fyl2x_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
7765}
7766
7767#if defined(IEM_WITHOUT_ASSEMBLY)
7768
7769static uint16_t iemAImpl_fyl2xp1_r80_by_r80_normal(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
7770{
7771 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
7772 extFloat80_t y = iemFpuSoftF80FromIprt(pr80Val1);
7773 extFloat80_t x = iemFpuSoftF80FromIprt(pr80Val2);
7774 extFloat80_t v;
7775 (void)fFcw;
7776
7777 v = extF80_ylog2xp1(y, x, &SoftState);
7778 iemFpuSoftF80ToIprt(pr80Result, v);
7779
7780 return fFsw;
7781}
7782
7783IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2xp1_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
7784 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
7785{
7786 uint16_t const fFcw = pFpuState->FCW;
7787 uint16_t fFsw = pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3);
7788
7789 if (RTFLOAT80U_IS_NORMAL(pr80Val1) && RTFLOAT80U_IS_NORMAL(pr80Val2) && pr80Val2->s.uExponent < RTFLOAT80U_EXP_BIAS)
7790 {
7791 fFsw = iemAImpl_fyl2xp1_r80_by_r80_normal(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw);
7792
7793 fFsw |= X86_FSW_PE | (7 << X86_FSW_TOP_SHIFT);
7794 if (!(fFcw & X86_FCW_PM))
7795 fFsw |= X86_FSW_ES | X86_FSW_B;
7796 }
7797 else
7798 {
7799 fFsw |= X86_FSW_IE;
7800
7801 if (!(fFcw & X86_FCW_IM))
7802 {
7803 pFpuRes->r80Result = *pr80Val2;
7804 fFsw |= X86_FSW_ES | X86_FSW_B | (6 << X86_FSW_TOP_SHIFT);
7805 }
7806 else
7807 {
7808 pFpuRes->r80Result = g_r80Indefinite;
7809 fFsw |= (7 << X86_FSW_TOP_SHIFT);
7810 }
7811 }
7812
7813 pFpuRes->FSW = fFsw;
7814}
7815
7816#endif /* IEM_WITHOUT_ASSEMBLY */
7817
7818IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2xp1_r80_by_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
7819 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
7820{
7821 iemAImpl_fyl2xp1_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
7822}
7823
7824IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2xp1_r80_by_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
7825 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
7826{
7827 iemAImpl_fyl2xp1_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
7828}
7829
7830
7831/*********************************************************************************************************************************
7832* MMX, SSE & AVX *
7833*********************************************************************************************************************************/
7834
/*
 * PAND / VPAND / ANDPS / VANDPS / ANDPD / VANDPD
 */
7838#ifdef IEM_WITHOUT_ASSEMBLY
7839
7840IEM_DECL_IMPL_DEF(void, iemAImpl_pand_u64,(uint64_t *puDst, uint64_t const *puSrc))
7841{
7842 *puDst &= *puSrc;
7843}
7844
7845
7846IEM_DECL_IMPL_DEF(void, iemAImpl_pand_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
7847{
7848 puDst->au64[0] &= puSrc->au64[0];
7849 puDst->au64[1] &= puSrc->au64[1];
7850}
7851
7852#endif
7853
7854IEM_DECL_IMPL_DEF(void, iemAImpl_vpand_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7855{
7856 puDst->au64[0] = puSrc1->au64[0] & puSrc2->au64[0];
7857 puDst->au64[1] = puSrc1->au64[1] & puSrc2->au64[1];
7858}
7859
7860
7861IEM_DECL_IMPL_DEF(void, iemAImpl_vpand_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7862{
7863 puDst->au64[0] = puSrc1->au64[0] & puSrc2->au64[0];
7864 puDst->au64[1] = puSrc1->au64[1] & puSrc2->au64[1];
7865 puDst->au64[2] = puSrc1->au64[2] & puSrc2->au64[2];
7866 puDst->au64[3] = puSrc1->au64[3] & puSrc2->au64[3];
7867}
7868
7869
/*
 * PANDN / VPANDN / ANDNPS / VANDNPS / ANDNPD / VANDNPD
 */
7873#ifdef IEM_WITHOUT_ASSEMBLY
7874
7875IEM_DECL_IMPL_DEF(void, iemAImpl_pandn_u64,(uint64_t *puDst, uint64_t const *puSrc))
7876{
7877 *puDst = ~*puDst & *puSrc;
7878}
7879
7880
7881IEM_DECL_IMPL_DEF(void, iemAImpl_pandn_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
7882{
7883 puDst->au64[0] = ~puDst->au64[0] & puSrc->au64[0];
7884 puDst->au64[1] = ~puDst->au64[1] & puSrc->au64[1];
7885}
7886
7887#endif
7888
7889IEM_DECL_IMPL_DEF(void, iemAImpl_vpandn_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7890{
7891 puDst->au64[0] = ~puSrc1->au64[0] & puSrc2->au64[0];
7892 puDst->au64[1] = ~puSrc1->au64[1] & puSrc2->au64[1];
7893}
7894
7895
7896IEM_DECL_IMPL_DEF(void, iemAImpl_vpandn_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7897{
7898 puDst->au64[0] = ~puSrc1->au64[0] & puSrc2->au64[0];
7899 puDst->au64[1] = ~puSrc1->au64[1] & puSrc2->au64[1];
7900 puDst->au64[2] = ~puSrc1->au64[2] & puSrc2->au64[2];
7901 puDst->au64[3] = ~puSrc1->au64[3] & puSrc2->au64[3];
7902}
7903
7904
/*
 * POR / VPOR / ORPS / VORPS / ORPD / VORPD
 */
7908#ifdef IEM_WITHOUT_ASSEMBLY
7909
7910IEM_DECL_IMPL_DEF(void, iemAImpl_por_u64,(uint64_t *puDst, uint64_t const *puSrc))
7911{
7912 *puDst |= *puSrc;
7913}
7914
7915
7916IEM_DECL_IMPL_DEF(void, iemAImpl_por_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
7917{
7918 puDst->au64[0] |= puSrc->au64[0];
7919 puDst->au64[1] |= puSrc->au64[1];
7920}
7921
7922#endif
7923
7924IEM_DECL_IMPL_DEF(void, iemAImpl_vpor_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7925{
7926 puDst->au64[0] = puSrc1->au64[0] | puSrc2->au64[0];
7927 puDst->au64[1] = puSrc1->au64[1] | puSrc2->au64[1];
7928}
7929
7930
7931IEM_DECL_IMPL_DEF(void, iemAImpl_vpor_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7932{
7933 puDst->au64[0] = puSrc1->au64[0] | puSrc2->au64[0];
7934 puDst->au64[1] = puSrc1->au64[1] | puSrc2->au64[1];
7935 puDst->au64[2] = puSrc1->au64[2] | puSrc2->au64[2];
7936 puDst->au64[3] = puSrc1->au64[3] | puSrc2->au64[3];
7937}
7938
7939
/*
 * PXOR / VPXOR / XORPS / VXORPS / XORPD / VXORPD
 */
7943#ifdef IEM_WITHOUT_ASSEMBLY
7944
7945IEM_DECL_IMPL_DEF(void, iemAImpl_pxor_u64,(uint64_t *puDst, uint64_t const *puSrc))
7946{
7947 *puDst ^= *puSrc;
7948}
7949
7950
7951IEM_DECL_IMPL_DEF(void, iemAImpl_pxor_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
7952{
7953 puDst->au64[0] ^= puSrc->au64[0];
7954 puDst->au64[1] ^= puSrc->au64[1];
7955}
7956
7957#endif
7958
7959IEM_DECL_IMPL_DEF(void, iemAImpl_vpxor_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7960{
7961 puDst->au64[0] = puSrc1->au64[0] ^ puSrc2->au64[0];
7962 puDst->au64[1] = puSrc1->au64[1] ^ puSrc2->au64[1];
7963}
7964
7965
7966IEM_DECL_IMPL_DEF(void, iemAImpl_vpxor_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7967{
7968 puDst->au64[0] = puSrc1->au64[0] ^ puSrc2->au64[0];
7969 puDst->au64[1] = puSrc1->au64[1] ^ puSrc2->au64[1];
7970 puDst->au64[2] = puSrc1->au64[2] ^ puSrc2->au64[2];
7971 puDst->au64[3] = puSrc1->au64[3] ^ puSrc2->au64[3];
7972}
7973
7974
7975/*
7976 * PCMPEQB / VPCMPEQB
7977 */
7978#ifdef IEM_WITHOUT_ASSEMBLY
7979
7980IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqb_u64,(uint64_t *puDst, uint64_t const *puSrc))
7981{
7982 RTUINT64U uSrc1 = { *puDst };
7983 RTUINT64U uSrc2 = { *puSrc };
7984 RTUINT64U uDst;
7985 uDst.au8[0] = uSrc1.au8[0] == uSrc2.au8[0] ? 0xff : 0;
7986 uDst.au8[1] = uSrc1.au8[1] == uSrc2.au8[1] ? 0xff : 0;
7987 uDst.au8[2] = uSrc1.au8[2] == uSrc2.au8[2] ? 0xff : 0;
7988 uDst.au8[3] = uSrc1.au8[3] == uSrc2.au8[3] ? 0xff : 0;
7989 uDst.au8[4] = uSrc1.au8[4] == uSrc2.au8[4] ? 0xff : 0;
7990 uDst.au8[5] = uSrc1.au8[5] == uSrc2.au8[5] ? 0xff : 0;
7991 uDst.au8[6] = uSrc1.au8[6] == uSrc2.au8[6] ? 0xff : 0;
7992 uDst.au8[7] = uSrc1.au8[7] == uSrc2.au8[7] ? 0xff : 0;
7993 *puDst = uDst.u;
7994}
7995
7996
7997IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqb_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
7998{
7999 RTUINT128U uSrc1 = *puDst;
8000 puDst->au8[0] = uSrc1.au8[0] == puSrc->au8[0] ? UINT8_MAX : 0;
8001 puDst->au8[1] = uSrc1.au8[1] == puSrc->au8[1] ? UINT8_MAX : 0;
8002 puDst->au8[2] = uSrc1.au8[2] == puSrc->au8[2] ? UINT8_MAX : 0;
8003 puDst->au8[3] = uSrc1.au8[3] == puSrc->au8[3] ? UINT8_MAX : 0;
8004 puDst->au8[4] = uSrc1.au8[4] == puSrc->au8[4] ? UINT8_MAX : 0;
8005 puDst->au8[5] = uSrc1.au8[5] == puSrc->au8[5] ? UINT8_MAX : 0;
8006 puDst->au8[6] = uSrc1.au8[6] == puSrc->au8[6] ? UINT8_MAX : 0;
8007 puDst->au8[7] = uSrc1.au8[7] == puSrc->au8[7] ? UINT8_MAX : 0;
8008 puDst->au8[8] = uSrc1.au8[8] == puSrc->au8[8] ? UINT8_MAX : 0;
8009 puDst->au8[9] = uSrc1.au8[9] == puSrc->au8[9] ? UINT8_MAX : 0;
8010 puDst->au8[10] = uSrc1.au8[10] == puSrc->au8[10] ? UINT8_MAX : 0;
8011 puDst->au8[11] = uSrc1.au8[11] == puSrc->au8[11] ? UINT8_MAX : 0;
8012 puDst->au8[12] = uSrc1.au8[12] == puSrc->au8[12] ? UINT8_MAX : 0;
8013 puDst->au8[13] = uSrc1.au8[13] == puSrc->au8[13] ? UINT8_MAX : 0;
8014 puDst->au8[14] = uSrc1.au8[14] == puSrc->au8[14] ? UINT8_MAX : 0;
8015 puDst->au8[15] = uSrc1.au8[15] == puSrc->au8[15] ? UINT8_MAX : 0;
8016}
8017
8018#endif
8019
8020IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8021{
8022 puDst->au8[0] = puSrc1->au8[0] == puSrc2->au8[0] ? UINT8_MAX : 0;
8023 puDst->au8[1] = puSrc1->au8[1] == puSrc2->au8[1] ? UINT8_MAX : 0;
8024 puDst->au8[2] = puSrc1->au8[2] == puSrc2->au8[2] ? UINT8_MAX : 0;
8025 puDst->au8[3] = puSrc1->au8[3] == puSrc2->au8[3] ? UINT8_MAX : 0;
8026 puDst->au8[4] = puSrc1->au8[4] == puSrc2->au8[4] ? UINT8_MAX : 0;
8027 puDst->au8[5] = puSrc1->au8[5] == puSrc2->au8[5] ? UINT8_MAX : 0;
8028 puDst->au8[6] = puSrc1->au8[6] == puSrc2->au8[6] ? UINT8_MAX : 0;
8029 puDst->au8[7] = puSrc1->au8[7] == puSrc2->au8[7] ? UINT8_MAX : 0;
8030 puDst->au8[8] = puSrc1->au8[8] == puSrc2->au8[8] ? UINT8_MAX : 0;
8031 puDst->au8[9] = puSrc1->au8[9] == puSrc2->au8[9] ? UINT8_MAX : 0;
8032 puDst->au8[10] = puSrc1->au8[10] == puSrc2->au8[10] ? UINT8_MAX : 0;
8033 puDst->au8[11] = puSrc1->au8[11] == puSrc2->au8[11] ? UINT8_MAX : 0;
8034 puDst->au8[12] = puSrc1->au8[12] == puSrc2->au8[12] ? UINT8_MAX : 0;
8035 puDst->au8[13] = puSrc1->au8[13] == puSrc2->au8[13] ? UINT8_MAX : 0;
8036 puDst->au8[14] = puSrc1->au8[14] == puSrc2->au8[14] ? UINT8_MAX : 0;
8037 puDst->au8[15] = puSrc1->au8[15] == puSrc2->au8[15] ? UINT8_MAX : 0;
8038}
8039
8040IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8041{
8042 puDst->au8[0] = puSrc1->au8[0] == puSrc2->au8[0] ? UINT8_MAX : 0;
8043 puDst->au8[1] = puSrc1->au8[1] == puSrc2->au8[1] ? UINT8_MAX : 0;
8044 puDst->au8[2] = puSrc1->au8[2] == puSrc2->au8[2] ? UINT8_MAX : 0;
8045 puDst->au8[3] = puSrc1->au8[3] == puSrc2->au8[3] ? UINT8_MAX : 0;
8046 puDst->au8[4] = puSrc1->au8[4] == puSrc2->au8[4] ? UINT8_MAX : 0;
8047 puDst->au8[5] = puSrc1->au8[5] == puSrc2->au8[5] ? UINT8_MAX : 0;
8048 puDst->au8[6] = puSrc1->au8[6] == puSrc2->au8[6] ? UINT8_MAX : 0;
8049 puDst->au8[7] = puSrc1->au8[7] == puSrc2->au8[7] ? UINT8_MAX : 0;
8050 puDst->au8[8] = puSrc1->au8[8] == puSrc2->au8[8] ? UINT8_MAX : 0;
8051 puDst->au8[9] = puSrc1->au8[9] == puSrc2->au8[9] ? UINT8_MAX : 0;
8052 puDst->au8[10] = puSrc1->au8[10] == puSrc2->au8[10] ? UINT8_MAX : 0;
8053 puDst->au8[11] = puSrc1->au8[11] == puSrc2->au8[11] ? UINT8_MAX : 0;
8054 puDst->au8[12] = puSrc1->au8[12] == puSrc2->au8[12] ? UINT8_MAX : 0;
8055 puDst->au8[13] = puSrc1->au8[13] == puSrc2->au8[13] ? UINT8_MAX : 0;
8056 puDst->au8[14] = puSrc1->au8[14] == puSrc2->au8[14] ? UINT8_MAX : 0;
8057 puDst->au8[15] = puSrc1->au8[15] == puSrc2->au8[15] ? UINT8_MAX : 0;
8058 puDst->au8[16] = puSrc1->au8[16] == puSrc2->au8[16] ? UINT8_MAX : 0;
8059 puDst->au8[17] = puSrc1->au8[17] == puSrc2->au8[17] ? UINT8_MAX : 0;
8060 puDst->au8[18] = puSrc1->au8[18] == puSrc2->au8[18] ? UINT8_MAX : 0;
8061 puDst->au8[19] = puSrc1->au8[19] == puSrc2->au8[19] ? UINT8_MAX : 0;
8062 puDst->au8[20] = puSrc1->au8[20] == puSrc2->au8[20] ? UINT8_MAX : 0;
8063 puDst->au8[21] = puSrc1->au8[21] == puSrc2->au8[21] ? UINT8_MAX : 0;
8064 puDst->au8[22] = puSrc1->au8[22] == puSrc2->au8[22] ? UINT8_MAX : 0;
8065 puDst->au8[23] = puSrc1->au8[23] == puSrc2->au8[23] ? UINT8_MAX : 0;
8066 puDst->au8[24] = puSrc1->au8[24] == puSrc2->au8[24] ? UINT8_MAX : 0;
8067 puDst->au8[25] = puSrc1->au8[25] == puSrc2->au8[25] ? UINT8_MAX : 0;
8068 puDst->au8[26] = puSrc1->au8[26] == puSrc2->au8[26] ? UINT8_MAX : 0;
8069 puDst->au8[27] = puSrc1->au8[27] == puSrc2->au8[27] ? UINT8_MAX : 0;
8070 puDst->au8[28] = puSrc1->au8[28] == puSrc2->au8[28] ? UINT8_MAX : 0;
8071 puDst->au8[29] = puSrc1->au8[29] == puSrc2->au8[29] ? UINT8_MAX : 0;
8072 puDst->au8[30] = puSrc1->au8[30] == puSrc2->au8[30] ? UINT8_MAX : 0;
8073 puDst->au8[31] = puSrc1->au8[31] == puSrc2->au8[31] ? UINT8_MAX : 0;
8074}
8075
8076
8077/*
8078 * PCMPEQW / VPCMPEQW
8079 */
8080#ifdef IEM_WITHOUT_ASSEMBLY
8081
8082IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqw_u64,(uint64_t *puDst, uint64_t const *puSrc))
8083{
8084 RTUINT64U uSrc1 = { *puDst };
8085 RTUINT64U uSrc2 = { *puSrc };
8086 RTUINT64U uDst;
8087 uDst.au16[0] = uSrc1.au16[0] == uSrc2.au16[0] ? UINT16_MAX : 0;
8088 uDst.au16[1] = uSrc1.au16[1] == uSrc2.au16[1] ? UINT16_MAX : 0;
8089 uDst.au16[2] = uSrc1.au16[2] == uSrc2.au16[2] ? UINT16_MAX : 0;
8090 uDst.au16[3] = uSrc1.au16[3] == uSrc2.au16[3] ? UINT16_MAX : 0;
8091 *puDst = uDst.u;
8092}
8093
8094
8095IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
8096{
8097 RTUINT128U uSrc1 = *puDst;
8098 puDst->au16[0] = uSrc1.au16[0] == puSrc->au16[0] ? UINT16_MAX : 0;
8099 puDst->au16[1] = uSrc1.au16[1] == puSrc->au16[1] ? UINT16_MAX : 0;
8100 puDst->au16[2] = uSrc1.au16[2] == puSrc->au16[2] ? UINT16_MAX : 0;
8101 puDst->au16[3] = uSrc1.au16[3] == puSrc->au16[3] ? UINT16_MAX : 0;
8102 puDst->au16[4] = uSrc1.au16[4] == puSrc->au16[4] ? UINT16_MAX : 0;
8103 puDst->au16[5] = uSrc1.au16[5] == puSrc->au16[5] ? UINT16_MAX : 0;
8104 puDst->au16[6] = uSrc1.au16[6] == puSrc->au16[6] ? UINT16_MAX : 0;
8105 puDst->au16[7] = uSrc1.au16[7] == puSrc->au16[7] ? UINT16_MAX : 0;
8106}
8107
8108#endif
8109
8110IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8111{
8112 puDst->au16[0] = puSrc1->au16[0] == puSrc2->au16[0] ? UINT16_MAX : 0;
8113 puDst->au16[1] = puSrc1->au16[1] == puSrc2->au16[1] ? UINT16_MAX : 0;
8114 puDst->au16[2] = puSrc1->au16[2] == puSrc2->au16[2] ? UINT16_MAX : 0;
8115 puDst->au16[3] = puSrc1->au16[3] == puSrc2->au16[3] ? UINT16_MAX : 0;
8116 puDst->au16[4] = puSrc1->au16[4] == puSrc2->au16[4] ? UINT16_MAX : 0;
8117 puDst->au16[5] = puSrc1->au16[5] == puSrc2->au16[5] ? UINT16_MAX : 0;
8118 puDst->au16[6] = puSrc1->au16[6] == puSrc2->au16[6] ? UINT16_MAX : 0;
8119 puDst->au16[7] = puSrc1->au16[7] == puSrc2->au16[7] ? UINT16_MAX : 0;
8120}
8121
8122IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8123{
8124 puDst->au16[0] = puSrc1->au16[0] == puSrc2->au16[0] ? UINT16_MAX : 0;
8125 puDst->au16[1] = puSrc1->au16[1] == puSrc2->au16[1] ? UINT16_MAX : 0;
8126 puDst->au16[2] = puSrc1->au16[2] == puSrc2->au16[2] ? UINT16_MAX : 0;
8127 puDst->au16[3] = puSrc1->au16[3] == puSrc2->au16[3] ? UINT16_MAX : 0;
8128 puDst->au16[4] = puSrc1->au16[4] == puSrc2->au16[4] ? UINT16_MAX : 0;
8129 puDst->au16[5] = puSrc1->au16[5] == puSrc2->au16[5] ? UINT16_MAX : 0;
8130 puDst->au16[6] = puSrc1->au16[6] == puSrc2->au16[6] ? UINT16_MAX : 0;
8131 puDst->au16[7] = puSrc1->au16[7] == puSrc2->au16[7] ? UINT16_MAX : 0;
8132 puDst->au16[8] = puSrc1->au16[8] == puSrc2->au16[8] ? UINT16_MAX : 0;
8133 puDst->au16[9] = puSrc1->au16[9] == puSrc2->au16[9] ? UINT16_MAX : 0;
8134 puDst->au16[10] = puSrc1->au16[10] == puSrc2->au16[10] ? UINT16_MAX : 0;
8135 puDst->au16[11] = puSrc1->au16[11] == puSrc2->au16[11] ? UINT16_MAX : 0;
8136 puDst->au16[12] = puSrc1->au16[12] == puSrc2->au16[12] ? UINT16_MAX : 0;
8137 puDst->au16[13] = puSrc1->au16[13] == puSrc2->au16[13] ? UINT16_MAX : 0;
8138 puDst->au16[14] = puSrc1->au16[14] == puSrc2->au16[14] ? UINT16_MAX : 0;
8139 puDst->au16[15] = puSrc1->au16[15] == puSrc2->au16[15] ? UINT16_MAX : 0;
8140}
8141
8142
8143/*
8144 * PCMPEQD / VPCMPEQD.
8145 */
8146#ifdef IEM_WITHOUT_ASSEMBLY
8147
8148IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqd_u64,(uint64_t *puDst, uint64_t const *puSrc))
8149{
8150 RTUINT64U uSrc1 = { *puDst };
8151 RTUINT64U uSrc2 = { *puSrc };
8152 RTUINT64U uDst;
8153 uDst.au32[0] = uSrc1.au32[0] == uSrc2.au32[0] ? UINT32_MAX : 0;
8154 uDst.au32[1] = uSrc1.au32[1] == uSrc2.au32[1] ? UINT32_MAX : 0;
8155 *puDst = uDst.u;
8156}
8157
8158
8159IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
8160{
8161 RTUINT128U uSrc1 = *puDst;
8162 puDst->au32[0] = uSrc1.au32[0] == puSrc->au32[0] ? UINT32_MAX : 0;
8163 puDst->au32[1] = uSrc1.au32[1] == puSrc->au32[1] ? UINT32_MAX : 0;
8164 puDst->au32[2] = uSrc1.au32[2] == puSrc->au32[2] ? UINT32_MAX : 0;
8165 puDst->au32[3] = uSrc1.au32[3] == puSrc->au32[3] ? UINT32_MAX : 0;
8166}
8167
8168#endif /* IEM_WITHOUT_ASSEMBLY */
8169
8170IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8171{
8172 puDst->au32[0] = puSrc1->au32[0] == puSrc2->au32[0] ? UINT32_MAX : 0;
8173 puDst->au32[1] = puSrc1->au32[1] == puSrc2->au32[1] ? UINT32_MAX : 0;
8174 puDst->au32[2] = puSrc1->au32[2] == puSrc2->au32[2] ? UINT32_MAX : 0;
8175 puDst->au32[3] = puSrc1->au32[3] == puSrc2->au32[3] ? UINT32_MAX : 0;
8176}
8177
8178IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8179{
8180 puDst->au32[0] = puSrc1->au32[0] == puSrc2->au32[0] ? UINT32_MAX : 0;
8181 puDst->au32[1] = puSrc1->au32[1] == puSrc2->au32[1] ? UINT32_MAX : 0;
8182 puDst->au32[2] = puSrc1->au32[2] == puSrc2->au32[2] ? UINT32_MAX : 0;
8183 puDst->au32[3] = puSrc1->au32[3] == puSrc2->au32[3] ? UINT32_MAX : 0;
8184 puDst->au32[4] = puSrc1->au32[4] == puSrc2->au32[4] ? UINT32_MAX : 0;
8185 puDst->au32[5] = puSrc1->au32[5] == puSrc2->au32[5] ? UINT32_MAX : 0;
8186 puDst->au32[6] = puSrc1->au32[6] == puSrc2->au32[6] ? UINT32_MAX : 0;
8187 puDst->au32[7] = puSrc1->au32[7] == puSrc2->au32[7] ? UINT32_MAX : 0;
8188}
8189
8190
8191/*
8192 * PCMPEQQ / VPCMPEQQ.
8193 */
8194IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
8195{
8196 RTUINT128U uSrc1 = *puDst;
8197 puDst->au64[0] = uSrc1.au64[0] == puSrc->au64[0] ? UINT64_MAX : 0;
8198 puDst->au64[1] = uSrc1.au64[1] == puSrc->au64[1] ? UINT64_MAX : 0;
8199}
8200
8201IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8202{
8203 puDst->au64[0] = puSrc1->au64[0] == puSrc2->au64[0] ? UINT64_MAX : 0;
8204 puDst->au64[1] = puSrc1->au64[1] == puSrc2->au64[1] ? UINT64_MAX : 0;
8205}
8206
8207IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8208{
8209 puDst->au64[0] = puSrc1->au64[0] == puSrc2->au64[0] ? UINT64_MAX : 0;
8210 puDst->au64[1] = puSrc1->au64[1] == puSrc2->au64[1] ? UINT64_MAX : 0;
8211 puDst->au64[2] = puSrc1->au64[2] == puSrc2->au64[2] ? UINT64_MAX : 0;
8212 puDst->au64[3] = puSrc1->au64[3] == puSrc2->au64[3] ? UINT64_MAX : 0;
8213}
8214
8215
8216/*
8217 * PCMPGTB / VPCMPGTB
8218 */
8219#ifdef IEM_WITHOUT_ASSEMBLY
8220
8221IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtb_u64,(uint64_t *puDst, uint64_t const *puSrc))
8222{
8223 RTUINT64U uSrc1 = { *puDst };
8224 RTUINT64U uSrc2 = { *puSrc };
8225 RTUINT64U uDst;
8226 uDst.au8[0] = uSrc1.ai8[0] > uSrc2.ai8[0] ? UINT8_MAX : 0;
8227 uDst.au8[1] = uSrc1.ai8[1] > uSrc2.ai8[1] ? UINT8_MAX : 0;
8228 uDst.au8[2] = uSrc1.ai8[2] > uSrc2.ai8[2] ? UINT8_MAX : 0;
8229 uDst.au8[3] = uSrc1.ai8[3] > uSrc2.ai8[3] ? UINT8_MAX : 0;
8230 uDst.au8[4] = uSrc1.ai8[4] > uSrc2.ai8[4] ? UINT8_MAX : 0;
8231 uDst.au8[5] = uSrc1.ai8[5] > uSrc2.ai8[5] ? UINT8_MAX : 0;
8232 uDst.au8[6] = uSrc1.ai8[6] > uSrc2.ai8[6] ? UINT8_MAX : 0;
8233 uDst.au8[7] = uSrc1.ai8[7] > uSrc2.ai8[7] ? UINT8_MAX : 0;
8234 *puDst = uDst.u;
8235}
8236
8237
8238IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtb_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
8239{
8240 RTUINT128U uSrc1 = *puDst;
8241 puDst->au8[0] = uSrc1.ai8[0] > puSrc->ai8[0] ? UINT8_MAX : 0;
8242 puDst->au8[1] = uSrc1.ai8[1] > puSrc->ai8[1] ? UINT8_MAX : 0;
8243 puDst->au8[2] = uSrc1.ai8[2] > puSrc->ai8[2] ? UINT8_MAX : 0;
8244 puDst->au8[3] = uSrc1.ai8[3] > puSrc->ai8[3] ? UINT8_MAX : 0;
8245 puDst->au8[4] = uSrc1.ai8[4] > puSrc->ai8[4] ? UINT8_MAX : 0;
8246 puDst->au8[5] = uSrc1.ai8[5] > puSrc->ai8[5] ? UINT8_MAX : 0;
8247 puDst->au8[6] = uSrc1.ai8[6] > puSrc->ai8[6] ? UINT8_MAX : 0;
8248 puDst->au8[7] = uSrc1.ai8[7] > puSrc->ai8[7] ? UINT8_MAX : 0;
8249 puDst->au8[8] = uSrc1.ai8[8] > puSrc->ai8[8] ? UINT8_MAX : 0;
8250 puDst->au8[9] = uSrc1.ai8[9] > puSrc->ai8[9] ? UINT8_MAX : 0;
8251 puDst->au8[10] = uSrc1.ai8[10] > puSrc->ai8[10] ? UINT8_MAX : 0;
8252 puDst->au8[11] = uSrc1.ai8[11] > puSrc->ai8[11] ? UINT8_MAX : 0;
8253 puDst->au8[12] = uSrc1.ai8[12] > puSrc->ai8[12] ? UINT8_MAX : 0;
8254 puDst->au8[13] = uSrc1.ai8[13] > puSrc->ai8[13] ? UINT8_MAX : 0;
8255 puDst->au8[14] = uSrc1.ai8[14] > puSrc->ai8[14] ? UINT8_MAX : 0;
8256 puDst->au8[15] = uSrc1.ai8[15] > puSrc->ai8[15] ? UINT8_MAX : 0;
8257}
8258
8259#endif
8260
8261IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8262{
8263 puDst->au8[0] = puSrc1->ai8[0] > puSrc2->ai8[0] ? UINT8_MAX : 0;
8264 puDst->au8[1] = puSrc1->ai8[1] > puSrc2->ai8[1] ? UINT8_MAX : 0;
8265 puDst->au8[2] = puSrc1->ai8[2] > puSrc2->ai8[2] ? UINT8_MAX : 0;
8266 puDst->au8[3] = puSrc1->ai8[3] > puSrc2->ai8[3] ? UINT8_MAX : 0;
8267 puDst->au8[4] = puSrc1->ai8[4] > puSrc2->ai8[4] ? UINT8_MAX : 0;
8268 puDst->au8[5] = puSrc1->ai8[5] > puSrc2->ai8[5] ? UINT8_MAX : 0;
8269 puDst->au8[6] = puSrc1->ai8[6] > puSrc2->ai8[6] ? UINT8_MAX : 0;
8270 puDst->au8[7] = puSrc1->ai8[7] > puSrc2->ai8[7] ? UINT8_MAX : 0;
8271 puDst->au8[8] = puSrc1->ai8[8] > puSrc2->ai8[8] ? UINT8_MAX : 0;
8272 puDst->au8[9] = puSrc1->ai8[9] > puSrc2->ai8[9] ? UINT8_MAX : 0;
8273 puDst->au8[10] = puSrc1->ai8[10] > puSrc2->ai8[10] ? UINT8_MAX : 0;
8274 puDst->au8[11] = puSrc1->ai8[11] > puSrc2->ai8[11] ? UINT8_MAX : 0;
8275 puDst->au8[12] = puSrc1->ai8[12] > puSrc2->ai8[12] ? UINT8_MAX : 0;
8276 puDst->au8[13] = puSrc1->ai8[13] > puSrc2->ai8[13] ? UINT8_MAX : 0;
8277 puDst->au8[14] = puSrc1->ai8[14] > puSrc2->ai8[14] ? UINT8_MAX : 0;
8278 puDst->au8[15] = puSrc1->ai8[15] > puSrc2->ai8[15] ? UINT8_MAX : 0;
8279}
8280
8281IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8282{
8283 puDst->au8[0] = puSrc1->ai8[0] > puSrc2->ai8[0] ? UINT8_MAX : 0;
8284 puDst->au8[1] = puSrc1->ai8[1] > puSrc2->ai8[1] ? UINT8_MAX : 0;
8285 puDst->au8[2] = puSrc1->ai8[2] > puSrc2->ai8[2] ? UINT8_MAX : 0;
8286 puDst->au8[3] = puSrc1->ai8[3] > puSrc2->ai8[3] ? UINT8_MAX : 0;
8287 puDst->au8[4] = puSrc1->ai8[4] > puSrc2->ai8[4] ? UINT8_MAX : 0;
8288 puDst->au8[5] = puSrc1->ai8[5] > puSrc2->ai8[5] ? UINT8_MAX : 0;
8289 puDst->au8[6] = puSrc1->ai8[6] > puSrc2->ai8[6] ? UINT8_MAX : 0;
8290 puDst->au8[7] = puSrc1->ai8[7] > puSrc2->ai8[7] ? UINT8_MAX : 0;
8291 puDst->au8[8] = puSrc1->ai8[8] > puSrc2->ai8[8] ? UINT8_MAX : 0;
8292 puDst->au8[9] = puSrc1->ai8[9] > puSrc2->ai8[9] ? UINT8_MAX : 0;
8293 puDst->au8[10] = puSrc1->ai8[10] > puSrc2->ai8[10] ? UINT8_MAX : 0;
8294 puDst->au8[11] = puSrc1->ai8[11] > puSrc2->ai8[11] ? UINT8_MAX : 0;
8295 puDst->au8[12] = puSrc1->ai8[12] > puSrc2->ai8[12] ? UINT8_MAX : 0;
8296 puDst->au8[13] = puSrc1->ai8[13] > puSrc2->ai8[13] ? UINT8_MAX : 0;
8297 puDst->au8[14] = puSrc1->ai8[14] > puSrc2->ai8[14] ? UINT8_MAX : 0;
8298 puDst->au8[15] = puSrc1->ai8[15] > puSrc2->ai8[15] ? UINT8_MAX : 0;
8299 puDst->au8[16] = puSrc1->ai8[16] > puSrc2->ai8[16] ? UINT8_MAX : 0;
8300 puDst->au8[17] = puSrc1->ai8[17] > puSrc2->ai8[17] ? UINT8_MAX : 0;
8301 puDst->au8[18] = puSrc1->ai8[18] > puSrc2->ai8[18] ? UINT8_MAX : 0;
8302 puDst->au8[19] = puSrc1->ai8[19] > puSrc2->ai8[19] ? UINT8_MAX : 0;
8303 puDst->au8[20] = puSrc1->ai8[20] > puSrc2->ai8[20] ? UINT8_MAX : 0;
8304 puDst->au8[21] = puSrc1->ai8[21] > puSrc2->ai8[21] ? UINT8_MAX : 0;
8305 puDst->au8[22] = puSrc1->ai8[22] > puSrc2->ai8[22] ? UINT8_MAX : 0;
8306 puDst->au8[23] = puSrc1->ai8[23] > puSrc2->ai8[23] ? UINT8_MAX : 0;
8307 puDst->au8[24] = puSrc1->ai8[24] > puSrc2->ai8[24] ? UINT8_MAX : 0;
8308 puDst->au8[25] = puSrc1->ai8[25] > puSrc2->ai8[25] ? UINT8_MAX : 0;
8309 puDst->au8[26] = puSrc1->ai8[26] > puSrc2->ai8[26] ? UINT8_MAX : 0;
8310 puDst->au8[27] = puSrc1->ai8[27] > puSrc2->ai8[27] ? UINT8_MAX : 0;
8311 puDst->au8[28] = puSrc1->ai8[28] > puSrc2->ai8[28] ? UINT8_MAX : 0;
8312 puDst->au8[29] = puSrc1->ai8[29] > puSrc2->ai8[29] ? UINT8_MAX : 0;
8313 puDst->au8[30] = puSrc1->ai8[30] > puSrc2->ai8[30] ? UINT8_MAX : 0;
8314 puDst->au8[31] = puSrc1->ai8[31] > puSrc2->ai8[31] ? UINT8_MAX : 0;
8315}
8316
8317
8318/*
8319 * PCMPGTW / VPCMPGTW
8320 */
8321#ifdef IEM_WITHOUT_ASSEMBLY
8322
8323IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtw_u64,(uint64_t *puDst, uint64_t const *puSrc))
8324{
8325 RTUINT64U uSrc1 = { *puDst };
8326 RTUINT64U uSrc2 = { *puSrc };
8327 RTUINT64U uDst;
8328 uDst.au16[0] = uSrc1.ai16[0] > uSrc2.ai16[0] ? UINT16_MAX : 0;
8329 uDst.au16[1] = uSrc1.ai16[1] > uSrc2.ai16[1] ? UINT16_MAX : 0;
8330 uDst.au16[2] = uSrc1.ai16[2] > uSrc2.ai16[2] ? UINT16_MAX : 0;
8331 uDst.au16[3] = uSrc1.ai16[3] > uSrc2.ai16[3] ? UINT16_MAX : 0;
8332 *puDst = uDst.u;
8333}
8334
8335
8336IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
8337{
8338 RTUINT128U uSrc1 = *puDst;
8339 puDst->au16[0] = uSrc1.ai16[0] > puSrc->ai16[0] ? UINT16_MAX : 0;
8340 puDst->au16[1] = uSrc1.ai16[1] > puSrc->ai16[1] ? UINT16_MAX : 0;
8341 puDst->au16[2] = uSrc1.ai16[2] > puSrc->ai16[2] ? UINT16_MAX : 0;
8342 puDst->au16[3] = uSrc1.ai16[3] > puSrc->ai16[3] ? UINT16_MAX : 0;
8343 puDst->au16[4] = uSrc1.ai16[4] > puSrc->ai16[4] ? UINT16_MAX : 0;
8344 puDst->au16[5] = uSrc1.ai16[5] > puSrc->ai16[5] ? UINT16_MAX : 0;
8345 puDst->au16[6] = uSrc1.ai16[6] > puSrc->ai16[6] ? UINT16_MAX : 0;
8346 puDst->au16[7] = uSrc1.ai16[7] > puSrc->ai16[7] ? UINT16_MAX : 0;
8347}
8348
8349#endif
8350
8351IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8352{
8353 puDst->au16[0] = puSrc1->ai16[0] > puSrc2->ai16[0] ? UINT16_MAX : 0;
8354 puDst->au16[1] = puSrc1->ai16[1] > puSrc2->ai16[1] ? UINT16_MAX : 0;
8355 puDst->au16[2] = puSrc1->ai16[2] > puSrc2->ai16[2] ? UINT16_MAX : 0;
8356 puDst->au16[3] = puSrc1->ai16[3] > puSrc2->ai16[3] ? UINT16_MAX : 0;
8357 puDst->au16[4] = puSrc1->ai16[4] > puSrc2->ai16[4] ? UINT16_MAX : 0;
8358 puDst->au16[5] = puSrc1->ai16[5] > puSrc2->ai16[5] ? UINT16_MAX : 0;
8359 puDst->au16[6] = puSrc1->ai16[6] > puSrc2->ai16[6] ? UINT16_MAX : 0;
8360 puDst->au16[7] = puSrc1->ai16[7] > puSrc2->ai16[7] ? UINT16_MAX : 0;
8361}
8362
8363IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8364{
8365 puDst->au16[0] = puSrc1->ai16[0] > puSrc2->ai16[0] ? UINT16_MAX : 0;
8366 puDst->au16[1] = puSrc1->ai16[1] > puSrc2->ai16[1] ? UINT16_MAX : 0;
8367 puDst->au16[2] = puSrc1->ai16[2] > puSrc2->ai16[2] ? UINT16_MAX : 0;
8368 puDst->au16[3] = puSrc1->ai16[3] > puSrc2->ai16[3] ? UINT16_MAX : 0;
8369 puDst->au16[4] = puSrc1->ai16[4] > puSrc2->ai16[4] ? UINT16_MAX : 0;
8370 puDst->au16[5] = puSrc1->ai16[5] > puSrc2->ai16[5] ? UINT16_MAX : 0;
8371 puDst->au16[6] = puSrc1->ai16[6] > puSrc2->ai16[6] ? UINT16_MAX : 0;
8372 puDst->au16[7] = puSrc1->ai16[7] > puSrc2->ai16[7] ? UINT16_MAX : 0;
8373 puDst->au16[8] = puSrc1->ai16[8] > puSrc2->ai16[8] ? UINT16_MAX : 0;
8374 puDst->au16[9] = puSrc1->ai16[9] > puSrc2->ai16[9] ? UINT16_MAX : 0;
8375 puDst->au16[10] = puSrc1->ai16[10] > puSrc2->ai16[10] ? UINT16_MAX : 0;
8376 puDst->au16[11] = puSrc1->ai16[11] > puSrc2->ai16[11] ? UINT16_MAX : 0;
8377 puDst->au16[12] = puSrc1->ai16[12] > puSrc2->ai16[12] ? UINT16_MAX : 0;
8378 puDst->au16[13] = puSrc1->ai16[13] > puSrc2->ai16[13] ? UINT16_MAX : 0;
8379 puDst->au16[14] = puSrc1->ai16[14] > puSrc2->ai16[14] ? UINT16_MAX : 0;
8380 puDst->au16[15] = puSrc1->ai16[15] > puSrc2->ai16[15] ? UINT16_MAX : 0;
8381}
8382
8383
8384/*
8385 * PCMPGTD / VPCMPGTD.
8386 */
8387#ifdef IEM_WITHOUT_ASSEMBLY
8388
8389IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtd_u64,(uint64_t *puDst, uint64_t const *puSrc))
8390{
8391 RTUINT64U uSrc1 = { *puDst };
8392 RTUINT64U uSrc2 = { *puSrc };
8393 RTUINT64U uDst;
8394 uDst.au32[0] = uSrc1.ai32[0] > uSrc2.ai32[0] ? UINT32_MAX : 0;
8395 uDst.au32[1] = uSrc1.ai32[1] > uSrc2.ai32[1] ? UINT32_MAX : 0;
8396 *puDst = uDst.u;
8397}
8398
8399
8400IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
8401{
8402 RTUINT128U uSrc1 = *puDst;
8403 puDst->au32[0] = uSrc1.ai32[0] > puSrc->ai32[0] ? UINT32_MAX : 0;
8404 puDst->au32[1] = uSrc1.ai32[1] > puSrc->ai32[1] ? UINT32_MAX : 0;
8405 puDst->au32[2] = uSrc1.ai32[2] > puSrc->ai32[2] ? UINT32_MAX : 0;
8406 puDst->au32[3] = uSrc1.ai32[3] > puSrc->ai32[3] ? UINT32_MAX : 0;
8407}
8408
8409#endif /* IEM_WITHOUT_ASSEMBLY */
8410
8411IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8412{
8413 puDst->au32[0] = puSrc1->ai32[0] > puSrc2->ai32[0] ? UINT32_MAX : 0;
8414 puDst->au32[1] = puSrc1->ai32[1] > puSrc2->ai32[1] ? UINT32_MAX : 0;
8415 puDst->au32[2] = puSrc1->ai32[2] > puSrc2->ai32[2] ? UINT32_MAX : 0;
8416 puDst->au32[3] = puSrc1->ai32[3] > puSrc2->ai32[3] ? UINT32_MAX : 0;
8417}
8418
8419IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8420{
8421 puDst->au32[0] = puSrc1->ai32[0] > puSrc2->ai32[0] ? UINT32_MAX : 0;
8422 puDst->au32[1] = puSrc1->ai32[1] > puSrc2->ai32[1] ? UINT32_MAX : 0;
8423 puDst->au32[2] = puSrc1->ai32[2] > puSrc2->ai32[2] ? UINT32_MAX : 0;
8424 puDst->au32[3] = puSrc1->ai32[3] > puSrc2->ai32[3] ? UINT32_MAX : 0;
8425 puDst->au32[4] = puSrc1->ai32[4] > puSrc2->ai32[4] ? UINT32_MAX : 0;
8426 puDst->au32[5] = puSrc1->ai32[5] > puSrc2->ai32[5] ? UINT32_MAX : 0;
8427 puDst->au32[6] = puSrc1->ai32[6] > puSrc2->ai32[6] ? UINT32_MAX : 0;
8428 puDst->au32[7] = puSrc1->ai32[7] > puSrc2->ai32[7] ? UINT32_MAX : 0;
8429}
8430
8431
8432/*
8433 * PCMPGTQ / VPCMPGTQ.
8434 */
8435IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
8436{
8437 RTUINT128U uSrc1 = *puDst;
8438 puDst->au64[0] = uSrc1.ai64[0] > puSrc->ai64[0] ? UINT64_MAX : 0;
8439 puDst->au64[1] = uSrc1.ai64[1] > puSrc->ai64[1] ? UINT64_MAX : 0;
8440}
8441
8442IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8443{
8444 puDst->au64[0] = puSrc1->ai64[0] > puSrc2->ai64[0] ? UINT64_MAX : 0;
8445 puDst->au64[1] = puSrc1->ai64[1] > puSrc2->ai64[1] ? UINT64_MAX : 0;
8446}
8447
8448IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8449{
8450 puDst->au64[0] = puSrc1->ai64[0] > puSrc2->ai64[0] ? UINT64_MAX : 0;
8451 puDst->au64[1] = puSrc1->ai64[1] > puSrc2->ai64[1] ? UINT64_MAX : 0;
8452 puDst->au64[2] = puSrc1->ai64[2] > puSrc2->ai64[2] ? UINT64_MAX : 0;
8453 puDst->au64[3] = puSrc1->ai64[3] > puSrc2->ai64[3] ? UINT64_MAX : 0;
8454}
8455
8456
8457/*
8458 * PADDB / VPADDB
8459 */
8460#ifdef IEM_WITHOUT_ASSEMBLY
8461
8462IEM_DECL_IMPL_DEF(void, iemAImpl_paddb_u64,(uint64_t *puDst, uint64_t const *puSrc))
8463{
8464 RTUINT64U uSrc1 = { *puDst };
8465 RTUINT64U uSrc2 = { *puSrc };
8466 RTUINT64U uDst;
8467 uDst.au8[0] = uSrc1.au8[0] + uSrc2.au8[0];
8468 uDst.au8[1] = uSrc1.au8[1] + uSrc2.au8[1];
8469 uDst.au8[2] = uSrc1.au8[2] + uSrc2.au8[2];
8470 uDst.au8[3] = uSrc1.au8[3] + uSrc2.au8[3];
8471 uDst.au8[4] = uSrc1.au8[4] + uSrc2.au8[4];
8472 uDst.au8[5] = uSrc1.au8[5] + uSrc2.au8[5];
8473 uDst.au8[6] = uSrc1.au8[6] + uSrc2.au8[6];
8474 uDst.au8[7] = uSrc1.au8[7] + uSrc2.au8[7];
8475 *puDst = uDst.u;
8476}
8477
8478
8479IEM_DECL_IMPL_DEF(void, iemAImpl_paddb_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
8480{
8481 RTUINT128U uSrc1 = *puDst;
8482 puDst->au8[0] = uSrc1.au8[0] + puSrc->au8[0];
8483 puDst->au8[1] = uSrc1.au8[1] + puSrc->au8[1];
8484 puDst->au8[2] = uSrc1.au8[2] + puSrc->au8[2];
8485 puDst->au8[3] = uSrc1.au8[3] + puSrc->au8[3];
8486 puDst->au8[4] = uSrc1.au8[4] + puSrc->au8[4];
8487 puDst->au8[5] = uSrc1.au8[5] + puSrc->au8[5];
8488 puDst->au8[6] = uSrc1.au8[6] + puSrc->au8[6];
8489 puDst->au8[7] = uSrc1.au8[7] + puSrc->au8[7];
8490 puDst->au8[8] = uSrc1.au8[8] + puSrc->au8[8];
8491 puDst->au8[9] = uSrc1.au8[9] + puSrc->au8[9];
8492 puDst->au8[10] = uSrc1.au8[10] + puSrc->au8[10];
8493 puDst->au8[11] = uSrc1.au8[11] + puSrc->au8[11];
8494 puDst->au8[12] = uSrc1.au8[12] + puSrc->au8[12];
8495 puDst->au8[13] = uSrc1.au8[13] + puSrc->au8[13];
8496 puDst->au8[14] = uSrc1.au8[14] + puSrc->au8[14];
8497 puDst->au8[15] = uSrc1.au8[15] + puSrc->au8[15];
8498}
8499
8500#endif
8501
8502
8503IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8504{
8505 puDst->au8[0] = puSrc1->au8[0] + puSrc2->au8[0];
8506 puDst->au8[1] = puSrc1->au8[1] + puSrc2->au8[1];
8507 puDst->au8[2] = puSrc1->au8[2] + puSrc2->au8[2];
8508 puDst->au8[3] = puSrc1->au8[3] + puSrc2->au8[3];
8509 puDst->au8[4] = puSrc1->au8[4] + puSrc2->au8[4];
8510 puDst->au8[5] = puSrc1->au8[5] + puSrc2->au8[5];
8511 puDst->au8[6] = puSrc1->au8[6] + puSrc2->au8[6];
8512 puDst->au8[7] = puSrc1->au8[7] + puSrc2->au8[7];
8513 puDst->au8[8] = puSrc1->au8[8] + puSrc2->au8[8];
8514 puDst->au8[9] = puSrc1->au8[9] + puSrc2->au8[9];
8515 puDst->au8[10] = puSrc1->au8[10] + puSrc2->au8[10];
8516 puDst->au8[11] = puSrc1->au8[11] + puSrc2->au8[11];
8517 puDst->au8[12] = puSrc1->au8[12] + puSrc2->au8[12];
8518 puDst->au8[13] = puSrc1->au8[13] + puSrc2->au8[13];
8519 puDst->au8[14] = puSrc1->au8[14] + puSrc2->au8[14];
8520 puDst->au8[15] = puSrc1->au8[15] + puSrc2->au8[15];
8521}
8522
8523IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8524{
8525 puDst->au8[0] = puSrc1->au8[0] + puSrc2->au8[0];
8526 puDst->au8[1] = puSrc1->au8[1] + puSrc2->au8[1];
8527 puDst->au8[2] = puSrc1->au8[2] + puSrc2->au8[2];
8528 puDst->au8[3] = puSrc1->au8[3] + puSrc2->au8[3];
8529 puDst->au8[4] = puSrc1->au8[4] + puSrc2->au8[4];
8530 puDst->au8[5] = puSrc1->au8[5] + puSrc2->au8[5];
8531 puDst->au8[6] = puSrc1->au8[6] + puSrc2->au8[6];
8532 puDst->au8[7] = puSrc1->au8[7] + puSrc2->au8[7];
8533 puDst->au8[8] = puSrc1->au8[8] + puSrc2->au8[8];
8534 puDst->au8[9] = puSrc1->au8[9] + puSrc2->au8[9];
8535 puDst->au8[10] = puSrc1->au8[10] + puSrc2->au8[10];
8536 puDst->au8[11] = puSrc1->au8[11] + puSrc2->au8[11];
8537 puDst->au8[12] = puSrc1->au8[12] + puSrc2->au8[12];
8538 puDst->au8[13] = puSrc1->au8[13] + puSrc2->au8[13];
8539 puDst->au8[14] = puSrc1->au8[14] + puSrc2->au8[14];
8540 puDst->au8[15] = puSrc1->au8[15] + puSrc2->au8[15];
8541 puDst->au8[16] = puSrc1->au8[16] + puSrc2->au8[16];
8542 puDst->au8[17] = puSrc1->au8[17] + puSrc2->au8[17];
8543 puDst->au8[18] = puSrc1->au8[18] + puSrc2->au8[18];
8544 puDst->au8[19] = puSrc1->au8[19] + puSrc2->au8[19];
8545 puDst->au8[20] = puSrc1->au8[20] + puSrc2->au8[20];
8546 puDst->au8[21] = puSrc1->au8[21] + puSrc2->au8[21];
8547 puDst->au8[22] = puSrc1->au8[22] + puSrc2->au8[22];
8548 puDst->au8[23] = puSrc1->au8[23] + puSrc2->au8[23];
8549 puDst->au8[24] = puSrc1->au8[24] + puSrc2->au8[24];
8550 puDst->au8[25] = puSrc1->au8[25] + puSrc2->au8[25];
8551 puDst->au8[26] = puSrc1->au8[26] + puSrc2->au8[26];
8552 puDst->au8[27] = puSrc1->au8[27] + puSrc2->au8[27];
8553 puDst->au8[28] = puSrc1->au8[28] + puSrc2->au8[28];
8554 puDst->au8[29] = puSrc1->au8[29] + puSrc2->au8[29];
8555 puDst->au8[30] = puSrc1->au8[30] + puSrc2->au8[30];
8556 puDst->au8[31] = puSrc1->au8[31] + puSrc2->au8[31];
8557}
8558
8559
8560/*
8561 * PADDSB / VPADDSB
8562 */
/**
 * Clamps a signed word-sized value to the signed byte range and yields the
 * resulting bit pattern as an unsigned byte value.
 *
 * Adding 0x80 shifts the representable range -128..127 onto 0..0xff, so a
 * single unsigned compare detects values that do not fit.  Out-of-range
 * values become 0x7f (INT8_MAX) when bit 15 (the sign) is clear and 0x80
 * (INT8_MIN) when it is set.
 *
 * @note    The argument is evaluated more than once; pass expressions without
 *          side effects only.
 */
#define SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(a_iWord) \
    ( (uint16_t)((a_iWord) + 0x80) > (uint16_t)0xff \
      ? (uint8_t)0x7f + (uint8_t)(((a_iWord) >> 15) & 1) \
      : (uint8_t)(a_iWord) )
8567
8568#ifdef IEM_WITHOUT_ASSEMBLY
8569
8570IEM_DECL_IMPL_DEF(void, iemAImpl_paddsb_u64,(uint64_t *puDst, uint64_t const *puSrc))
8571{
8572 RTUINT64U uSrc1 = { *puDst };
8573 RTUINT64U uSrc2 = { *puSrc };
8574 RTUINT64U uDst;
8575 uDst.au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[0] + uSrc2.ai8[0]);
8576 uDst.au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[1] + uSrc2.ai8[1]);
8577 uDst.au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[2] + uSrc2.ai8[2]);
8578 uDst.au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[3] + uSrc2.ai8[3]);
8579 uDst.au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[4] + uSrc2.ai8[4]);
8580 uDst.au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[5] + uSrc2.ai8[5]);
8581 uDst.au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[6] + uSrc2.ai8[6]);
8582 uDst.au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[7] + uSrc2.ai8[7]);
8583 *puDst = uDst.u;
8584}
8585
8586
8587IEM_DECL_IMPL_DEF(void, iemAImpl_paddsb_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
8588{
8589 RTUINT128U uSrc1 = *puDst;
8590 puDst->au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[0] + puSrc->ai8[0]);
8591 puDst->au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[1] + puSrc->ai8[1]);
8592 puDst->au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[2] + puSrc->ai8[2]);
8593 puDst->au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[3] + puSrc->ai8[3]);
8594 puDst->au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[4] + puSrc->ai8[4]);
8595 puDst->au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[5] + puSrc->ai8[5]);
8596 puDst->au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[6] + puSrc->ai8[6]);
8597 puDst->au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[7] + puSrc->ai8[7]);
8598 puDst->au8[8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[8] + puSrc->ai8[8]);
8599 puDst->au8[9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[9] + puSrc->ai8[9]);
8600 puDst->au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[10] + puSrc->ai8[10]);
8601 puDst->au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[11] + puSrc->ai8[11]);
8602 puDst->au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[12] + puSrc->ai8[12]);
8603 puDst->au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[13] + puSrc->ai8[13]);
8604 puDst->au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[14] + puSrc->ai8[14]);
8605 puDst->au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[15] + puSrc->ai8[15]);
8606}
8607
8608#endif
8609
8610IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddsb_u128_fallback,(PRTUINT128U puDst,
8611 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8612{
8613 puDst->au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[0] + puSrc2->ai8[0]);
8614 puDst->au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[1] + puSrc2->ai8[1]);
8615 puDst->au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[2] + puSrc2->ai8[2]);
8616 puDst->au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[3] + puSrc2->ai8[3]);
8617 puDst->au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[4] + puSrc2->ai8[4]);
8618 puDst->au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[5] + puSrc2->ai8[5]);
8619 puDst->au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[6] + puSrc2->ai8[6]);
8620 puDst->au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[7] + puSrc2->ai8[7]);
8621 puDst->au8[8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[8] + puSrc2->ai8[8]);
8622 puDst->au8[9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[9] + puSrc2->ai8[9]);
8623 puDst->au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[10] + puSrc2->ai8[10]);
8624 puDst->au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[11] + puSrc2->ai8[11]);
8625 puDst->au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[12] + puSrc2->ai8[12]);
8626 puDst->au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[13] + puSrc2->ai8[13]);
8627 puDst->au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[14] + puSrc2->ai8[14]);
8628 puDst->au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[15] + puSrc2->ai8[15]);
8629}
8630
8631IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddsb_u256_fallback,(PRTUINT256U puDst,
8632 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8633{
8634 puDst->au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[0] + puSrc2->ai8[0]);
8635 puDst->au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[1] + puSrc2->ai8[1]);
8636 puDst->au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[2] + puSrc2->ai8[2]);
8637 puDst->au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[3] + puSrc2->ai8[3]);
8638 puDst->au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[4] + puSrc2->ai8[4]);
8639 puDst->au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[5] + puSrc2->ai8[5]);
8640 puDst->au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[6] + puSrc2->ai8[6]);
8641 puDst->au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[7] + puSrc2->ai8[7]);
8642 puDst->au8[8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[8] + puSrc2->ai8[8]);
8643 puDst->au8[9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[9] + puSrc2->ai8[9]);
8644 puDst->au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[10] + puSrc2->ai8[10]);
8645 puDst->au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[11] + puSrc2->ai8[11]);
8646 puDst->au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[12] + puSrc2->ai8[12]);
8647 puDst->au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[13] + puSrc2->ai8[13]);
8648 puDst->au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[14] + puSrc2->ai8[14]);
8649 puDst->au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[15] + puSrc2->ai8[15]);
8650 puDst->au8[16] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[16] + puSrc2->ai8[16]);
8651 puDst->au8[17] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[17] + puSrc2->ai8[17]);
8652 puDst->au8[18] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[18] + puSrc2->ai8[18]);
8653 puDst->au8[19] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[19] + puSrc2->ai8[19]);
8654 puDst->au8[20] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[20] + puSrc2->ai8[20]);
8655 puDst->au8[21] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[21] + puSrc2->ai8[21]);
8656 puDst->au8[22] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[22] + puSrc2->ai8[22]);
8657 puDst->au8[23] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[23] + puSrc2->ai8[23]);
8658 puDst->au8[24] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[24] + puSrc2->ai8[24]);
8659 puDst->au8[25] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[25] + puSrc2->ai8[25]);
8660 puDst->au8[26] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[26] + puSrc2->ai8[26]);
8661 puDst->au8[27] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[27] + puSrc2->ai8[27]);
8662 puDst->au8[28] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[28] + puSrc2->ai8[28]);
8663 puDst->au8[29] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[29] + puSrc2->ai8[29]);
8664 puDst->au8[30] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[30] + puSrc2->ai8[30]);
8665 puDst->au8[31] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[31] + puSrc2->ai8[31]);
8666}
8667
8668
8669/*
8670 * PADDUSB / VPADDUSB
8671 */
/** Clamps a widened unsigned byte sum to 0xff (UINT8_MAX).
 * The argument is truncated to 16 bits before the range check and may be
 * evaluated more than once, so it must be free of side effects. */
#define SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(a_uWord) \
    ( (uint16_t)(a_uWord) > (uint16_t)0xff \
      ? (uint8_t)0xff /* overflow: saturate at UINT8_MAX */ \
      : (uint8_t)(a_uWord) )
8676
8677#ifdef IEM_WITHOUT_ASSEMBLY
8678
8679IEM_DECL_IMPL_DEF(void, iemAImpl_paddusb_u64,(uint64_t *puDst, uint64_t const *puSrc))
8680{
8681 RTUINT64U uSrc1 = { *puDst };
8682 RTUINT64U uSrc2 = { *puSrc };
8683 RTUINT64U uDst;
8684 uDst.au8[0] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[0] + uSrc2.au8[0]);
8685 uDst.au8[1] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[1] + uSrc2.au8[1]);
8686 uDst.au8[2] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[2] + uSrc2.au8[2]);
8687 uDst.au8[3] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[3] + uSrc2.au8[3]);
8688 uDst.au8[4] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[4] + uSrc2.au8[4]);
8689 uDst.au8[5] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[5] + uSrc2.au8[5]);
8690 uDst.au8[6] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[6] + uSrc2.au8[6]);
8691 uDst.au8[7] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[7] + uSrc2.au8[7]);
8692 *puDst = uDst.u;
8693}
8694
8695
8696IEM_DECL_IMPL_DEF(void, iemAImpl_paddusb_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
8697{
8698 RTUINT128U uSrc1 = *puDst;
8699 puDst->au8[0] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[0] + puSrc->au8[0]);
8700 puDst->au8[1] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[1] + puSrc->au8[1]);
8701 puDst->au8[2] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[2] + puSrc->au8[2]);
8702 puDst->au8[3] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[3] + puSrc->au8[3]);
8703 puDst->au8[4] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[4] + puSrc->au8[4]);
8704 puDst->au8[5] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[5] + puSrc->au8[5]);
8705 puDst->au8[6] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[6] + puSrc->au8[6]);
8706 puDst->au8[7] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[7] + puSrc->au8[7]);
8707 puDst->au8[8] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[8] + puSrc->au8[8]);
8708 puDst->au8[9] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[9] + puSrc->au8[9]);
8709 puDst->au8[10] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[10] + puSrc->au8[10]);
8710 puDst->au8[11] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[11] + puSrc->au8[11]);
8711 puDst->au8[12] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[12] + puSrc->au8[12]);
8712 puDst->au8[13] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[13] + puSrc->au8[13]);
8713 puDst->au8[14] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[14] + puSrc->au8[14]);
8714 puDst->au8[15] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[15] + puSrc->au8[15]);
8715}
8716
8717#endif
8718
8719IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddusb_u128_fallback,(PRTUINT128U puDst,
8720 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8721{
8722 puDst->au8[0] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[0] + puSrc2->au8[0]);
8723 puDst->au8[1] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[1] + puSrc2->au8[1]);
8724 puDst->au8[2] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[2] + puSrc2->au8[2]);
8725 puDst->au8[3] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[3] + puSrc2->au8[3]);
8726 puDst->au8[4] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[4] + puSrc2->au8[4]);
8727 puDst->au8[5] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[5] + puSrc2->au8[5]);
8728 puDst->au8[6] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[6] + puSrc2->au8[6]);
8729 puDst->au8[7] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[7] + puSrc2->au8[7]);
8730 puDst->au8[8] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[8] + puSrc2->au8[8]);
8731 puDst->au8[9] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[9] + puSrc2->au8[9]);
8732 puDst->au8[10] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[10] + puSrc2->au8[10]);
8733 puDst->au8[11] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[11] + puSrc2->au8[11]);
8734 puDst->au8[12] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[12] + puSrc2->au8[12]);
8735 puDst->au8[13] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[13] + puSrc2->au8[13]);
8736 puDst->au8[14] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[14] + puSrc2->au8[14]);
8737 puDst->au8[15] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[15] + puSrc2->au8[15]);
8738}
8739
8740IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddusb_u256_fallback,(PRTUINT256U puDst,
8741 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8742{
8743 puDst->au8[0] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[0] + puSrc2->au8[0]);
8744 puDst->au8[1] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[1] + puSrc2->au8[1]);
8745 puDst->au8[2] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[2] + puSrc2->au8[2]);
8746 puDst->au8[3] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[3] + puSrc2->au8[3]);
8747 puDst->au8[4] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[4] + puSrc2->au8[4]);
8748 puDst->au8[5] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[5] + puSrc2->au8[5]);
8749 puDst->au8[6] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[6] + puSrc2->au8[6]);
8750 puDst->au8[7] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[7] + puSrc2->au8[7]);
8751 puDst->au8[8] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[8] + puSrc2->au8[8]);
8752 puDst->au8[9] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[9] + puSrc2->au8[9]);
8753 puDst->au8[10] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[10] + puSrc2->au8[10]);
8754 puDst->au8[11] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[11] + puSrc2->au8[11]);
8755 puDst->au8[12] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[12] + puSrc2->au8[12]);
8756 puDst->au8[13] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[13] + puSrc2->au8[13]);
8757 puDst->au8[14] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[14] + puSrc2->au8[14]);
8758 puDst->au8[15] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[15] + puSrc2->au8[15]);
8759 puDst->au8[16] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[16] + puSrc2->au8[16]);
8760 puDst->au8[17] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[17] + puSrc2->au8[17]);
8761 puDst->au8[18] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[18] + puSrc2->au8[18]);
8762 puDst->au8[19] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[19] + puSrc2->au8[19]);
8763 puDst->au8[20] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[20] + puSrc2->au8[20]);
8764 puDst->au8[21] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[21] + puSrc2->au8[21]);
8765 puDst->au8[22] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[22] + puSrc2->au8[22]);
8766 puDst->au8[23] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[23] + puSrc2->au8[23]);
8767 puDst->au8[24] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[24] + puSrc2->au8[24]);
8768 puDst->au8[25] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[25] + puSrc2->au8[25]);
8769 puDst->au8[26] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[26] + puSrc2->au8[26]);
8770 puDst->au8[27] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[27] + puSrc2->au8[27]);
8771 puDst->au8[28] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[28] + puSrc2->au8[28]);
8772 puDst->au8[29] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[29] + puSrc2->au8[29]);
8773 puDst->au8[30] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[30] + puSrc2->au8[30]);
8774 puDst->au8[31] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[31] + puSrc2->au8[31]);
8775}
8776
8777
8778/*
8779 * PADDW / VPADDW
8780 */
8781#ifdef IEM_WITHOUT_ASSEMBLY
8782
8783IEM_DECL_IMPL_DEF(void, iemAImpl_paddw_u64,(uint64_t *puDst, uint64_t const *puSrc))
8784{
8785 RTUINT64U uSrc1 = { *puDst };
8786 RTUINT64U uSrc2 = { *puSrc };
8787 RTUINT64U uDst;
8788 uDst.au16[0] = uSrc1.au16[0] + uSrc2.au16[0];
8789 uDst.au16[1] = uSrc1.au16[1] + uSrc2.au16[1];
8790 uDst.au16[2] = uSrc1.au16[2] + uSrc2.au16[2];
8791 uDst.au16[3] = uSrc1.au16[3] + uSrc2.au16[3];
8792 *puDst = uDst.u;
8793}
8794
8795
8796IEM_DECL_IMPL_DEF(void, iemAImpl_paddw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
8797{
8798 RTUINT128U uSrc1 = *puDst;
8799 puDst->au16[0] = uSrc1.au16[0] + puSrc->au16[0];
8800 puDst->au16[1] = uSrc1.au16[1] + puSrc->au16[1];
8801 puDst->au16[2] = uSrc1.au16[2] + puSrc->au16[2];
8802 puDst->au16[3] = uSrc1.au16[3] + puSrc->au16[3];
8803 puDst->au16[4] = uSrc1.au16[4] + puSrc->au16[4];
8804 puDst->au16[5] = uSrc1.au16[5] + puSrc->au16[5];
8805 puDst->au16[6] = uSrc1.au16[6] + puSrc->au16[6];
8806 puDst->au16[7] = uSrc1.au16[7] + puSrc->au16[7];
8807}
8808
8809#endif
8810
8811
8812IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8813{
8814 puDst->au16[0] = puSrc1->au16[0] + puSrc2->au16[0];
8815 puDst->au16[1] = puSrc1->au16[1] + puSrc2->au16[1];
8816 puDst->au16[2] = puSrc1->au16[2] + puSrc2->au16[2];
8817 puDst->au16[3] = puSrc1->au16[3] + puSrc2->au16[3];
8818 puDst->au16[4] = puSrc1->au16[4] + puSrc2->au16[4];
8819 puDst->au16[5] = puSrc1->au16[5] + puSrc2->au16[5];
8820 puDst->au16[6] = puSrc1->au16[6] + puSrc2->au16[6];
8821 puDst->au16[7] = puSrc1->au16[7] + puSrc2->au16[7];
8822}
8823
8824IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8825{
8826 puDst->au16[0] = puSrc1->au16[0] + puSrc2->au16[0];
8827 puDst->au16[1] = puSrc1->au16[1] + puSrc2->au16[1];
8828 puDst->au16[2] = puSrc1->au16[2] + puSrc2->au16[2];
8829 puDst->au16[3] = puSrc1->au16[3] + puSrc2->au16[3];
8830 puDst->au16[4] = puSrc1->au16[4] + puSrc2->au16[4];
8831 puDst->au16[5] = puSrc1->au16[5] + puSrc2->au16[5];
8832 puDst->au16[6] = puSrc1->au16[6] + puSrc2->au16[6];
8833 puDst->au16[7] = puSrc1->au16[7] + puSrc2->au16[7];
8834 puDst->au16[8] = puSrc1->au16[8] + puSrc2->au16[8];
8835 puDst->au16[9] = puSrc1->au16[9] + puSrc2->au16[9];
8836 puDst->au16[10] = puSrc1->au16[10] + puSrc2->au16[10];
8837 puDst->au16[11] = puSrc1->au16[11] + puSrc2->au16[11];
8838 puDst->au16[12] = puSrc1->au16[12] + puSrc2->au16[12];
8839 puDst->au16[13] = puSrc1->au16[13] + puSrc2->au16[13];
8840 puDst->au16[14] = puSrc1->au16[14] + puSrc2->au16[14];
8841 puDst->au16[15] = puSrc1->au16[15] + puSrc2->au16[15];
8842}
8843
8844
8845/*
8846 * PADDSW / VPADDSW
8847 */
/** Clamps a signed 32-bit sum to the int16_t range, yielding the 16-bit
 * pattern as an unsigned value.
 * The value is in range when adding 0x8000 brings it into 0..0xffff.
 * Otherwise the result is 0x7fff (INT16_MAX) plus the source sign bit
 * (bit 31), i.e. 0x7fff on positive and 0x8000 (INT16_MIN) on negative
 * overflow.  The argument may be evaluated more than once. */
#define SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(a_iDword) \
    ( (uint32_t)((a_iDword) + 0x8000) > (uint16_t)0xffff \
      ? (uint16_t)0x7fff + (uint16_t)(((a_iDword) >> 31) & 1) /* out of range: saturate */ \
      : (uint16_t)(a_iDword) )
8852
8853#ifdef IEM_WITHOUT_ASSEMBLY
8854
8855IEM_DECL_IMPL_DEF(void, iemAImpl_paddsw_u64,(uint64_t *puDst, uint64_t const *puSrc))
8856{
8857 RTUINT64U uSrc1 = { *puDst };
8858 RTUINT64U uSrc2 = { *puSrc };
8859 RTUINT64U uDst;
8860 uDst.au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] + uSrc2.ai16[0]);
8861 uDst.au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[1] + uSrc2.ai16[1]);
8862 uDst.au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] + uSrc2.ai16[2]);
8863 uDst.au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[3] + uSrc2.ai16[3]);
8864 *puDst = uDst.u;
8865}
8866
8867
8868IEM_DECL_IMPL_DEF(void, iemAImpl_paddsw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
8869{
8870 RTUINT128U uSrc1 = *puDst;
8871 puDst->au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] + puSrc->ai16[0]);
8872 puDst->au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[1] + puSrc->ai16[1]);
8873 puDst->au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] + puSrc->ai16[2]);
8874 puDst->au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[3] + puSrc->ai16[3]);
8875 puDst->au16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[4] + puSrc->ai16[4]);
8876 puDst->au16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[5] + puSrc->ai16[5]);
8877 puDst->au16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[6] + puSrc->ai16[6]);
8878 puDst->au16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[7] + puSrc->ai16[7]);
8879}
8880
8881#endif
8882
8883IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddsw_u128_fallback,(PRTUINT128U puDst,
8884 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8885{
8886 puDst->au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[0] + puSrc2->ai16[0]);
8887 puDst->au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[1] + puSrc2->ai16[1]);
8888 puDst->au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[2] + puSrc2->ai16[2]);
8889 puDst->au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[3] + puSrc2->ai16[3]);
8890 puDst->au16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[4] + puSrc2->ai16[4]);
8891 puDst->au16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[5] + puSrc2->ai16[5]);
8892 puDst->au16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[6] + puSrc2->ai16[6]);
8893 puDst->au16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[7] + puSrc2->ai16[7]);
8894}
8895
8896IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddsw_u256_fallback,(PRTUINT256U puDst,
8897 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8898{
8899 puDst->au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[0] + puSrc2->ai16[0]);
8900 puDst->au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[1] + puSrc2->ai16[1]);
8901 puDst->au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[2] + puSrc2->ai16[2]);
8902 puDst->au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[3] + puSrc2->ai16[3]);
8903 puDst->au16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[4] + puSrc2->ai16[4]);
8904 puDst->au16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[5] + puSrc2->ai16[5]);
8905 puDst->au16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[6] + puSrc2->ai16[6]);
8906 puDst->au16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[7] + puSrc2->ai16[7]);
8907 puDst->au16[8] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[8] + puSrc2->ai16[8]);
8908 puDst->au16[9] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[9] + puSrc2->ai16[9]);
8909 puDst->au16[10] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[10] + puSrc2->ai16[10]);
8910 puDst->au16[11] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[11] + puSrc2->ai16[11]);
8911 puDst->au16[12] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[12] + puSrc2->ai16[12]);
8912 puDst->au16[13] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[13] + puSrc2->ai16[13]);
8913 puDst->au16[14] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[14] + puSrc2->ai16[14]);
8914 puDst->au16[15] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[15] + puSrc2->ai16[15]);
8915}
8916
8917
8918/*
8919 * PADDUSW / VPADDUSW
8920 */
/** Clamps a widened unsigned word sum to 0xffff (UINT16_MAX).
 * The argument is converted to 32 bits for the range check and may be
 * evaluated more than once, so it must be free of side effects. */
#define SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(a_uDword) \
    ( (uint32_t)(a_uDword) > (uint16_t)0xffff \
      ? (uint16_t)0xffff /* overflow: saturate at UINT16_MAX */ \
      : (uint16_t)(a_uDword) )
8925
8926#ifdef IEM_WITHOUT_ASSEMBLY
8927
8928IEM_DECL_IMPL_DEF(void, iemAImpl_paddusw_u64,(uint64_t *puDst, uint64_t const *puSrc))
8929{
8930 RTUINT64U uSrc1 = { *puDst };
8931 RTUINT64U uSrc2 = { *puSrc };
8932 RTUINT64U uDst;
8933 uDst.au16[0] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[0] + uSrc2.au16[0]);
8934 uDst.au16[1] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[1] + uSrc2.au16[1]);
8935 uDst.au16[2] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[2] + uSrc2.au16[2]);
8936 uDst.au16[3] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[3] + uSrc2.au16[3]);
8937 *puDst = uDst.u;
8938}
8939
8940
8941IEM_DECL_IMPL_DEF(void, iemAImpl_paddusw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
8942{
8943 RTUINT128U uSrc1 = *puDst;
8944 puDst->au16[0] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[0] + puSrc->au16[0]);
8945 puDst->au16[1] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[1] + puSrc->au16[1]);
8946 puDst->au16[2] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[2] + puSrc->au16[2]);
8947 puDst->au16[3] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[3] + puSrc->au16[3]);
8948 puDst->au16[4] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[4] + puSrc->au16[4]);
8949 puDst->au16[5] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[5] + puSrc->au16[5]);
8950 puDst->au16[6] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[6] + puSrc->au16[6]);
8951 puDst->au16[7] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[7] + puSrc->au16[7]);
8952}
8953
8954#endif
8955
8956IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddusw_u128_fallback,(PRTUINT128U puDst,
8957 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8958{
8959 puDst->au16[0] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[0] + puSrc2->au16[0]);
8960 puDst->au16[1] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[1] + puSrc2->au16[1]);
8961 puDst->au16[2] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[2] + puSrc2->au16[2]);
8962 puDst->au16[3] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[3] + puSrc2->au16[3]);
8963 puDst->au16[4] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[4] + puSrc2->au16[4]);
8964 puDst->au16[5] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[5] + puSrc2->au16[5]);
8965 puDst->au16[6] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[6] + puSrc2->au16[6]);
8966 puDst->au16[7] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[7] + puSrc2->au16[7]);
8967}
8968
8969IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddusw_u256_fallback,(PRTUINT256U puDst,
8970 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8971{
8972 puDst->au16[0] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[0] + puSrc2->au16[0]);
8973 puDst->au16[1] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[1] + puSrc2->au16[1]);
8974 puDst->au16[2] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[2] + puSrc2->au16[2]);
8975 puDst->au16[3] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[3] + puSrc2->au16[3]);
8976 puDst->au16[4] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[4] + puSrc2->au16[4]);
8977 puDst->au16[5] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[5] + puSrc2->au16[5]);
8978 puDst->au16[6] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[6] + puSrc2->au16[6]);
8979 puDst->au16[7] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[7] + puSrc2->au16[7]);
8980 puDst->au16[8] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[8] + puSrc2->au16[8]);
8981 puDst->au16[9] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[9] + puSrc2->au16[9]);
8982 puDst->au16[10] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[10] + puSrc2->au16[10]);
8983 puDst->au16[11] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[11] + puSrc2->au16[11]);
8984 puDst->au16[12] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[12] + puSrc2->au16[12]);
8985 puDst->au16[13] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[13] + puSrc2->au16[13]);
8986 puDst->au16[14] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[14] + puSrc2->au16[14]);
8987 puDst->au16[15] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[15] + puSrc2->au16[15]);
8988}
8989
8990
8991/*
8992 * PADDD / VPADDD.
8993 */
8994#ifdef IEM_WITHOUT_ASSEMBLY
8995
8996IEM_DECL_IMPL_DEF(void, iemAImpl_paddd_u64,(uint64_t *puDst, uint64_t const *puSrc))
8997{
8998 RTUINT64U uSrc1 = { *puDst };
8999 RTUINT64U uSrc2 = { *puSrc };
9000 RTUINT64U uDst;
9001 uDst.au32[0] = uSrc1.au32[0] + uSrc2.au32[0];
9002 uDst.au32[1] = uSrc1.au32[1] + uSrc2.au32[1];
9003 *puDst = uDst.u;
9004}
9005
9006
9007IEM_DECL_IMPL_DEF(void, iemAImpl_paddd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9008{
9009 RTUINT128U uSrc1 = *puDst;
9010 puDst->au32[0] = uSrc1.au32[0] + puSrc->au32[0];
9011 puDst->au32[1] = uSrc1.au32[1] + puSrc->au32[1];
9012 puDst->au32[2] = uSrc1.au32[2] + puSrc->au32[2];
9013 puDst->au32[3] = uSrc1.au32[3] + puSrc->au32[3];
9014}
9015
9016#endif /* IEM_WITHOUT_ASSEMBLY */
9017
9018IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9019{
9020 puDst->au32[0] = puSrc1->au32[0] + puSrc2->au32[0];
9021 puDst->au32[1] = puSrc1->au32[1] + puSrc2->au32[1];
9022 puDst->au32[2] = puSrc1->au32[2] + puSrc2->au32[2];
9023 puDst->au32[3] = puSrc1->au32[3] + puSrc2->au32[3];
9024}
9025
9026IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9027{
9028 puDst->au32[0] = puSrc1->au32[0] + puSrc2->au32[0];
9029 puDst->au32[1] = puSrc1->au32[1] + puSrc2->au32[1];
9030 puDst->au32[2] = puSrc1->au32[2] + puSrc2->au32[2];
9031 puDst->au32[3] = puSrc1->au32[3] + puSrc2->au32[3];
9032 puDst->au32[4] = puSrc1->au32[4] + puSrc2->au32[4];
9033 puDst->au32[5] = puSrc1->au32[5] + puSrc2->au32[5];
9034 puDst->au32[6] = puSrc1->au32[6] + puSrc2->au32[6];
9035 puDst->au32[7] = puSrc1->au32[7] + puSrc2->au32[7];
9036}
9037
9038
9039/*
9040 * PADDQ / VPADDQ.
9041 */
9042#ifdef IEM_WITHOUT_ASSEMBLY
9043
9044IEM_DECL_IMPL_DEF(void, iemAImpl_paddq_u64,(uint64_t *puDst, uint64_t const *puSrc))
9045{
9046 *puDst = *puDst + *puSrc;
9047}
9048
9049IEM_DECL_IMPL_DEF(void, iemAImpl_paddq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9050{
9051 RTUINT128U uSrc1 = *puDst;
9052 puDst->au64[0] = uSrc1.au64[0] + puSrc->au64[0];
9053 puDst->au64[1] = uSrc1.au64[1] + puSrc->au64[1];
9054}
9055
9056#endif
9057
9058IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9059{
9060 puDst->au64[0] = puSrc1->au64[0] + puSrc2->au64[0];
9061 puDst->au64[1] = puSrc1->au64[1] + puSrc2->au64[1];
9062}
9063
9064IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9065{
9066 puDst->au64[0] = puSrc1->au64[0] + puSrc2->au64[0];
9067 puDst->au64[1] = puSrc1->au64[1] + puSrc2->au64[1];
9068 puDst->au64[2] = puSrc1->au64[2] + puSrc2->au64[2];
9069 puDst->au64[3] = puSrc1->au64[3] + puSrc2->au64[3];
9070}
9071
9072
9073/*
9074 * PSUBB / VPSUBB
9075 */
9076#ifdef IEM_WITHOUT_ASSEMBLY
9077
9078IEM_DECL_IMPL_DEF(void, iemAImpl_psubb_u64,(uint64_t *puDst, uint64_t const *puSrc))
9079{
9080 RTUINT64U uSrc1 = { *puDst };
9081 RTUINT64U uSrc2 = { *puSrc };
9082 RTUINT64U uDst;
9083 uDst.au8[0] = uSrc1.au8[0] - uSrc2.au8[0];
9084 uDst.au8[1] = uSrc1.au8[1] - uSrc2.au8[1];
9085 uDst.au8[2] = uSrc1.au8[2] - uSrc2.au8[2];
9086 uDst.au8[3] = uSrc1.au8[3] - uSrc2.au8[3];
9087 uDst.au8[4] = uSrc1.au8[4] - uSrc2.au8[4];
9088 uDst.au8[5] = uSrc1.au8[5] - uSrc2.au8[5];
9089 uDst.au8[6] = uSrc1.au8[6] - uSrc2.au8[6];
9090 uDst.au8[7] = uSrc1.au8[7] - uSrc2.au8[7];
9091 *puDst = uDst.u;
9092}
9093
9094
9095IEM_DECL_IMPL_DEF(void, iemAImpl_psubb_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9096{
9097 RTUINT128U uSrc1 = *puDst;
9098 puDst->au8[0] = uSrc1.au8[0] - puSrc->au8[0];
9099 puDst->au8[1] = uSrc1.au8[1] - puSrc->au8[1];
9100 puDst->au8[2] = uSrc1.au8[2] - puSrc->au8[2];
9101 puDst->au8[3] = uSrc1.au8[3] - puSrc->au8[3];
9102 puDst->au8[4] = uSrc1.au8[4] - puSrc->au8[4];
9103 puDst->au8[5] = uSrc1.au8[5] - puSrc->au8[5];
9104 puDst->au8[6] = uSrc1.au8[6] - puSrc->au8[6];
9105 puDst->au8[7] = uSrc1.au8[7] - puSrc->au8[7];
9106 puDst->au8[8] = uSrc1.au8[8] - puSrc->au8[8];
9107 puDst->au8[9] = uSrc1.au8[9] - puSrc->au8[9];
9108 puDst->au8[10] = uSrc1.au8[10] - puSrc->au8[10];
9109 puDst->au8[11] = uSrc1.au8[11] - puSrc->au8[11];
9110 puDst->au8[12] = uSrc1.au8[12] - puSrc->au8[12];
9111 puDst->au8[13] = uSrc1.au8[13] - puSrc->au8[13];
9112 puDst->au8[14] = uSrc1.au8[14] - puSrc->au8[14];
9113 puDst->au8[15] = uSrc1.au8[15] - puSrc->au8[15];
9114}
9115
9116#endif
9117
9118IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9119{
9120 puDst->au8[0] = puSrc1->au8[0] - puSrc2->au8[0];
9121 puDst->au8[1] = puSrc1->au8[1] - puSrc2->au8[1];
9122 puDst->au8[2] = puSrc1->au8[2] - puSrc2->au8[2];
9123 puDst->au8[3] = puSrc1->au8[3] - puSrc2->au8[3];
9124 puDst->au8[4] = puSrc1->au8[4] - puSrc2->au8[4];
9125 puDst->au8[5] = puSrc1->au8[5] - puSrc2->au8[5];
9126 puDst->au8[6] = puSrc1->au8[6] - puSrc2->au8[6];
9127 puDst->au8[7] = puSrc1->au8[7] - puSrc2->au8[7];
9128 puDst->au8[8] = puSrc1->au8[8] - puSrc2->au8[8];
9129 puDst->au8[9] = puSrc1->au8[9] - puSrc2->au8[9];
9130 puDst->au8[10] = puSrc1->au8[10] - puSrc2->au8[10];
9131 puDst->au8[11] = puSrc1->au8[11] - puSrc2->au8[11];
9132 puDst->au8[12] = puSrc1->au8[12] - puSrc2->au8[12];
9133 puDst->au8[13] = puSrc1->au8[13] - puSrc2->au8[13];
9134 puDst->au8[14] = puSrc1->au8[14] - puSrc2->au8[14];
9135 puDst->au8[15] = puSrc1->au8[15] - puSrc2->au8[15];
9136}
9137
9138IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9139{
9140 puDst->au8[0] = puSrc1->au8[0] - puSrc2->au8[0];
9141 puDst->au8[1] = puSrc1->au8[1] - puSrc2->au8[1];
9142 puDst->au8[2] = puSrc1->au8[2] - puSrc2->au8[2];
9143 puDst->au8[3] = puSrc1->au8[3] - puSrc2->au8[3];
9144 puDst->au8[4] = puSrc1->au8[4] - puSrc2->au8[4];
9145 puDst->au8[5] = puSrc1->au8[5] - puSrc2->au8[5];
9146 puDst->au8[6] = puSrc1->au8[6] - puSrc2->au8[6];
9147 puDst->au8[7] = puSrc1->au8[7] - puSrc2->au8[7];
9148 puDst->au8[8] = puSrc1->au8[8] - puSrc2->au8[8];
9149 puDst->au8[9] = puSrc1->au8[9] - puSrc2->au8[9];
9150 puDst->au8[10] = puSrc1->au8[10] - puSrc2->au8[10];
9151 puDst->au8[11] = puSrc1->au8[11] - puSrc2->au8[11];
9152 puDst->au8[12] = puSrc1->au8[12] - puSrc2->au8[12];
9153 puDst->au8[13] = puSrc1->au8[13] - puSrc2->au8[13];
9154 puDst->au8[14] = puSrc1->au8[14] - puSrc2->au8[14];
9155 puDst->au8[15] = puSrc1->au8[15] - puSrc2->au8[15];
9156 puDst->au8[16] = puSrc1->au8[16] - puSrc2->au8[16];
9157 puDst->au8[17] = puSrc1->au8[17] - puSrc2->au8[17];
9158 puDst->au8[18] = puSrc1->au8[18] - puSrc2->au8[18];
9159 puDst->au8[19] = puSrc1->au8[19] - puSrc2->au8[19];
9160 puDst->au8[20] = puSrc1->au8[20] - puSrc2->au8[20];
9161 puDst->au8[21] = puSrc1->au8[21] - puSrc2->au8[21];
9162 puDst->au8[22] = puSrc1->au8[22] - puSrc2->au8[22];
9163 puDst->au8[23] = puSrc1->au8[23] - puSrc2->au8[23];
9164 puDst->au8[24] = puSrc1->au8[24] - puSrc2->au8[24];
9165 puDst->au8[25] = puSrc1->au8[25] - puSrc2->au8[25];
9166 puDst->au8[26] = puSrc1->au8[26] - puSrc2->au8[26];
9167 puDst->au8[27] = puSrc1->au8[27] - puSrc2->au8[27];
9168 puDst->au8[28] = puSrc1->au8[28] - puSrc2->au8[28];
9169 puDst->au8[29] = puSrc1->au8[29] - puSrc2->au8[29];
9170 puDst->au8[30] = puSrc1->au8[30] - puSrc2->au8[30];
9171 puDst->au8[31] = puSrc1->au8[31] - puSrc2->au8[31];
9172}
9173
9174
9175/*
9176 * PSUBSB / VPSUBSB
9177 */
9178#ifdef IEM_WITHOUT_ASSEMBLY
9179
9180IEM_DECL_IMPL_DEF(void, iemAImpl_psubsb_u64,(uint64_t *puDst, uint64_t const *puSrc))
9181{
9182 RTUINT64U uSrc1 = { *puDst };
9183 RTUINT64U uSrc2 = { *puSrc };
9184 RTUINT64U uDst;
9185 uDst.au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[0] - uSrc2.ai8[0]);
9186 uDst.au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[1] - uSrc2.ai8[1]);
9187 uDst.au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[2] - uSrc2.ai8[2]);
9188 uDst.au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[3] - uSrc2.ai8[3]);
9189 uDst.au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[4] - uSrc2.ai8[4]);
9190 uDst.au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[5] - uSrc2.ai8[5]);
9191 uDst.au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[6] - uSrc2.ai8[6]);
9192 uDst.au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[7] - uSrc2.ai8[7]);
9193 *puDst = uDst.u;
9194}
9195
9196
9197IEM_DECL_IMPL_DEF(void, iemAImpl_psubsb_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9198{
9199 RTUINT128U uSrc1 = *puDst;
9200 puDst->au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[0] - puSrc->ai8[0]);
9201 puDst->au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[1] - puSrc->ai8[1]);
9202 puDst->au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[2] - puSrc->ai8[2]);
9203 puDst->au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[3] - puSrc->ai8[3]);
9204 puDst->au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[4] - puSrc->ai8[4]);
9205 puDst->au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[5] - puSrc->ai8[5]);
9206 puDst->au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[6] - puSrc->ai8[6]);
9207 puDst->au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[7] - puSrc->ai8[7]);
9208 puDst->au8[8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[8] - puSrc->ai8[8]);
9209 puDst->au8[9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[9] - puSrc->ai8[9]);
9210 puDst->au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[10] - puSrc->ai8[10]);
9211 puDst->au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[11] - puSrc->ai8[11]);
9212 puDst->au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[12] - puSrc->ai8[12]);
9213 puDst->au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[13] - puSrc->ai8[13]);
9214 puDst->au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[14] - puSrc->ai8[14]);
9215 puDst->au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[15] - puSrc->ai8[15]);
9216}
9217
9218#endif
9219
9220IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubsb_u128_fallback,(PRTUINT128U puDst,
9221 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9222{
9223 puDst->au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[0] - puSrc2->ai8[0]);
9224 puDst->au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[1] - puSrc2->ai8[1]);
9225 puDst->au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[2] - puSrc2->ai8[2]);
9226 puDst->au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[3] - puSrc2->ai8[3]);
9227 puDst->au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[4] - puSrc2->ai8[4]);
9228 puDst->au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[5] - puSrc2->ai8[5]);
9229 puDst->au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[6] - puSrc2->ai8[6]);
9230 puDst->au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[7] - puSrc2->ai8[7]);
9231 puDst->au8[8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[8] - puSrc2->ai8[8]);
9232 puDst->au8[9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[9] - puSrc2->ai8[9]);
9233 puDst->au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[10] - puSrc2->ai8[10]);
9234 puDst->au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[11] - puSrc2->ai8[11]);
9235 puDst->au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[12] - puSrc2->ai8[12]);
9236 puDst->au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[13] - puSrc2->ai8[13]);
9237 puDst->au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[14] - puSrc2->ai8[14]);
9238 puDst->au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[15] - puSrc2->ai8[15]);
9239}
9240
9241IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubsb_u256_fallback,(PRTUINT256U puDst,
9242 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9243{
9244 puDst->au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[0] - puSrc2->ai8[0]);
9245 puDst->au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[1] - puSrc2->ai8[1]);
9246 puDst->au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[2] - puSrc2->ai8[2]);
9247 puDst->au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[3] - puSrc2->ai8[3]);
9248 puDst->au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[4] - puSrc2->ai8[4]);
9249 puDst->au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[5] - puSrc2->ai8[5]);
9250 puDst->au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[6] - puSrc2->ai8[6]);
9251 puDst->au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[7] - puSrc2->ai8[7]);
9252 puDst->au8[8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[8] - puSrc2->ai8[8]);
9253 puDst->au8[9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[9] - puSrc2->ai8[9]);
9254 puDst->au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[10] - puSrc2->ai8[10]);
9255 puDst->au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[11] - puSrc2->ai8[11]);
9256 puDst->au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[12] - puSrc2->ai8[12]);
9257 puDst->au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[13] - puSrc2->ai8[13]);
9258 puDst->au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[14] - puSrc2->ai8[14]);
9259 puDst->au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[15] - puSrc2->ai8[15]);
9260 puDst->au8[16] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[16] - puSrc2->ai8[16]);
9261 puDst->au8[17] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[17] - puSrc2->ai8[17]);
9262 puDst->au8[18] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[18] - puSrc2->ai8[18]);
9263 puDst->au8[19] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[19] - puSrc2->ai8[19]);
9264 puDst->au8[20] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[20] - puSrc2->ai8[20]);
9265 puDst->au8[21] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[21] - puSrc2->ai8[21]);
9266 puDst->au8[22] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[22] - puSrc2->ai8[22]);
9267 puDst->au8[23] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[23] - puSrc2->ai8[23]);
9268 puDst->au8[24] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[24] - puSrc2->ai8[24]);
9269 puDst->au8[25] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[25] - puSrc2->ai8[25]);
9270 puDst->au8[26] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[26] - puSrc2->ai8[26]);
9271 puDst->au8[27] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[27] - puSrc2->ai8[27]);
9272 puDst->au8[28] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[28] - puSrc2->ai8[28]);
9273 puDst->au8[29] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[29] - puSrc2->ai8[29]);
9274 puDst->au8[30] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[30] - puSrc2->ai8[30]);
9275 puDst->au8[31] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[31] - puSrc2->ai8[31]);
9276}
9277
9278
9279/*
 * PSUBUSB / VPSUBUSB
9281 */
/**
 * Saturation step for unsigned byte subtraction.
 *
 * The argument is truncated to uint16_t.  If the truncated value exceeds 0xff
 * (which is what a negative byte difference turns into, e.g. 3 - 10 becomes
 * 0xfff9) the result is zero, otherwise it is the value as an uint8_t.
 *
 * @note    a_uWord is evaluated twice, so it must not have side effects.
 */
#define SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(a_uWord) \
    ( (uint16_t)(a_uWord) > (uint16_t)0xff \
      ? (uint8_t)0 \
      : (uint8_t)(a_uWord) )
9286
9287#ifdef IEM_WITHOUT_ASSEMBLY
9288
9289IEM_DECL_IMPL_DEF(void, iemAImpl_psubusb_u64,(uint64_t *puDst, uint64_t const *puSrc))
9290{
9291 RTUINT64U uSrc1 = { *puDst };
9292 RTUINT64U uSrc2 = { *puSrc };
9293 RTUINT64U uDst;
9294 uDst.au8[0] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[0] - uSrc2.au8[0]);
9295 uDst.au8[1] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[1] - uSrc2.au8[1]);
9296 uDst.au8[2] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[2] - uSrc2.au8[2]);
9297 uDst.au8[3] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[3] - uSrc2.au8[3]);
9298 uDst.au8[4] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[4] - uSrc2.au8[4]);
9299 uDst.au8[5] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[5] - uSrc2.au8[5]);
9300 uDst.au8[6] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[6] - uSrc2.au8[6]);
9301 uDst.au8[7] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[7] - uSrc2.au8[7]);
9302 *puDst = uDst.u;
9303}
9304
9305
9306IEM_DECL_IMPL_DEF(void, iemAImpl_psubusb_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9307{
9308 RTUINT128U uSrc1 = *puDst;
9309 puDst->au8[0] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[0] - puSrc->au8[0]);
9310 puDst->au8[1] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[1] - puSrc->au8[1]);
9311 puDst->au8[2] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[2] - puSrc->au8[2]);
9312 puDst->au8[3] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[3] - puSrc->au8[3]);
9313 puDst->au8[4] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[4] - puSrc->au8[4]);
9314 puDst->au8[5] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[5] - puSrc->au8[5]);
9315 puDst->au8[6] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[6] - puSrc->au8[6]);
9316 puDst->au8[7] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[7] - puSrc->au8[7]);
9317 puDst->au8[8] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[8] - puSrc->au8[8]);
9318 puDst->au8[9] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[9] - puSrc->au8[9]);
9319 puDst->au8[10] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[10] - puSrc->au8[10]);
9320 puDst->au8[11] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[11] - puSrc->au8[11]);
9321 puDst->au8[12] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[12] - puSrc->au8[12]);
9322 puDst->au8[13] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[13] - puSrc->au8[13]);
9323 puDst->au8[14] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[14] - puSrc->au8[14]);
9324 puDst->au8[15] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[15] - puSrc->au8[15]);
9325}
9326
9327#endif
9328
9329IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubusb_u128_fallback,(PRTUINT128U puDst,
9330 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9331{
9332 puDst->au8[0] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[0] - puSrc2->au8[0]);
9333 puDst->au8[1] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[1] - puSrc2->au8[1]);
9334 puDst->au8[2] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[2] - puSrc2->au8[2]);
9335 puDst->au8[3] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[3] - puSrc2->au8[3]);
9336 puDst->au8[4] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[4] - puSrc2->au8[4]);
9337 puDst->au8[5] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[5] - puSrc2->au8[5]);
9338 puDst->au8[6] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[6] - puSrc2->au8[6]);
9339 puDst->au8[7] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[7] - puSrc2->au8[7]);
9340 puDst->au8[8] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[8] - puSrc2->au8[8]);
9341 puDst->au8[9] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[9] - puSrc2->au8[9]);
9342 puDst->au8[10] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[10] - puSrc2->au8[10]);
9343 puDst->au8[11] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[11] - puSrc2->au8[11]);
9344 puDst->au8[12] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[12] - puSrc2->au8[12]);
9345 puDst->au8[13] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[13] - puSrc2->au8[13]);
9346 puDst->au8[14] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[14] - puSrc2->au8[14]);
9347 puDst->au8[15] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[15] - puSrc2->au8[15]);
9348}
9349
9350IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubusb_u256_fallback,(PRTUINT256U puDst,
9351 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9352{
9353 puDst->au8[0] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[0] - puSrc2->au8[0]);
9354 puDst->au8[1] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[1] - puSrc2->au8[1]);
9355 puDst->au8[2] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[2] - puSrc2->au8[2]);
9356 puDst->au8[3] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[3] - puSrc2->au8[3]);
9357 puDst->au8[4] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[4] - puSrc2->au8[4]);
9358 puDst->au8[5] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[5] - puSrc2->au8[5]);
9359 puDst->au8[6] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[6] - puSrc2->au8[6]);
9360 puDst->au8[7] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[7] - puSrc2->au8[7]);
9361 puDst->au8[8] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[8] - puSrc2->au8[8]);
9362 puDst->au8[9] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[9] - puSrc2->au8[9]);
9363 puDst->au8[10] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[10] - puSrc2->au8[10]);
9364 puDst->au8[11] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[11] - puSrc2->au8[11]);
9365 puDst->au8[12] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[12] - puSrc2->au8[12]);
9366 puDst->au8[13] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[13] - puSrc2->au8[13]);
9367 puDst->au8[14] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[14] - puSrc2->au8[14]);
9368 puDst->au8[15] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[15] - puSrc2->au8[15]);
9369 puDst->au8[16] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[16] - puSrc2->au8[16]);
9370 puDst->au8[17] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[17] - puSrc2->au8[17]);
9371 puDst->au8[18] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[18] - puSrc2->au8[18]);
9372 puDst->au8[19] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[19] - puSrc2->au8[19]);
9373 puDst->au8[20] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[20] - puSrc2->au8[20]);
9374 puDst->au8[21] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[21] - puSrc2->au8[21]);
9375 puDst->au8[22] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[22] - puSrc2->au8[22]);
9376 puDst->au8[23] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[23] - puSrc2->au8[23]);
9377 puDst->au8[24] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[24] - puSrc2->au8[24]);
9378 puDst->au8[25] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[25] - puSrc2->au8[25]);
9379 puDst->au8[26] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[26] - puSrc2->au8[26]);
9380 puDst->au8[27] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[27] - puSrc2->au8[27]);
9381 puDst->au8[28] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[28] - puSrc2->au8[28]);
9382 puDst->au8[29] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[29] - puSrc2->au8[29]);
9383 puDst->au8[30] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[30] - puSrc2->au8[30]);
9384 puDst->au8[31] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[31] - puSrc2->au8[31]);
9385}
9386
9387
9388/*
9389 * PSUBW / VPSUBW
9390 */
9391#ifdef IEM_WITHOUT_ASSEMBLY
9392
9393IEM_DECL_IMPL_DEF(void, iemAImpl_psubw_u64,(uint64_t *puDst, uint64_t const *puSrc))
9394{
9395 RTUINT64U uSrc1 = { *puDst };
9396 RTUINT64U uSrc2 = { *puSrc };
9397 RTUINT64U uDst;
9398 uDst.au16[0] = uSrc1.au16[0] - uSrc2.au16[0];
9399 uDst.au16[1] = uSrc1.au16[1] - uSrc2.au16[1];
9400 uDst.au16[2] = uSrc1.au16[2] - uSrc2.au16[2];
9401 uDst.au16[3] = uSrc1.au16[3] - uSrc2.au16[3];
9402 *puDst = uDst.u;
9403}
9404
9405
9406IEM_DECL_IMPL_DEF(void, iemAImpl_psubw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9407{
9408 RTUINT128U uSrc1 = *puDst;
9409 puDst->au16[0] = uSrc1.au16[0] - puSrc->au16[0];
9410 puDst->au16[1] = uSrc1.au16[1] - puSrc->au16[1];
9411 puDst->au16[2] = uSrc1.au16[2] - puSrc->au16[2];
9412 puDst->au16[3] = uSrc1.au16[3] - puSrc->au16[3];
9413 puDst->au16[4] = uSrc1.au16[4] - puSrc->au16[4];
9414 puDst->au16[5] = uSrc1.au16[5] - puSrc->au16[5];
9415 puDst->au16[6] = uSrc1.au16[6] - puSrc->au16[6];
9416 puDst->au16[7] = uSrc1.au16[7] - puSrc->au16[7];
9417}
9418
9419#endif
9420
9421IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9422{
9423 puDst->au16[0] = puSrc1->au16[0] - puSrc2->au16[0];
9424 puDst->au16[1] = puSrc1->au16[1] - puSrc2->au16[1];
9425 puDst->au16[2] = puSrc1->au16[2] - puSrc2->au16[2];
9426 puDst->au16[3] = puSrc1->au16[3] - puSrc2->au16[3];
9427 puDst->au16[4] = puSrc1->au16[4] - puSrc2->au16[4];
9428 puDst->au16[5] = puSrc1->au16[5] - puSrc2->au16[5];
9429 puDst->au16[6] = puSrc1->au16[6] - puSrc2->au16[6];
9430 puDst->au16[7] = puSrc1->au16[7] - puSrc2->au16[7];
9431}
9432
9433IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9434{
9435 puDst->au16[0] = puSrc1->au16[0] - puSrc2->au16[0];
9436 puDst->au16[1] = puSrc1->au16[1] - puSrc2->au16[1];
9437 puDst->au16[2] = puSrc1->au16[2] - puSrc2->au16[2];
9438 puDst->au16[3] = puSrc1->au16[3] - puSrc2->au16[3];
9439 puDst->au16[4] = puSrc1->au16[4] - puSrc2->au16[4];
9440 puDst->au16[5] = puSrc1->au16[5] - puSrc2->au16[5];
9441 puDst->au16[6] = puSrc1->au16[6] - puSrc2->au16[6];
9442 puDst->au16[7] = puSrc1->au16[7] - puSrc2->au16[7];
9443 puDst->au16[8] = puSrc1->au16[8] - puSrc2->au16[8];
9444 puDst->au16[9] = puSrc1->au16[9] - puSrc2->au16[9];
9445 puDst->au16[10] = puSrc1->au16[10] - puSrc2->au16[10];
9446 puDst->au16[11] = puSrc1->au16[11] - puSrc2->au16[11];
9447 puDst->au16[12] = puSrc1->au16[12] - puSrc2->au16[12];
9448 puDst->au16[13] = puSrc1->au16[13] - puSrc2->au16[13];
9449 puDst->au16[14] = puSrc1->au16[14] - puSrc2->au16[14];
9450 puDst->au16[15] = puSrc1->au16[15] - puSrc2->au16[15];
9451}
9452
9453
9454/*
9455 * PSUBSW / VPSUBSW
9456 */
9457#ifdef IEM_WITHOUT_ASSEMBLY
9458
9459IEM_DECL_IMPL_DEF(void, iemAImpl_psubsw_u64,(uint64_t *puDst, uint64_t const *puSrc))
9460{
9461 RTUINT64U uSrc1 = { *puDst };
9462 RTUINT64U uSrc2 = { *puSrc };
9463 RTUINT64U uDst;
9464 uDst.au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] - uSrc2.ai16[0]);
9465 uDst.au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[1] - uSrc2.ai16[1]);
9466 uDst.au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] - uSrc2.ai16[2]);
9467 uDst.au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[3] - uSrc2.ai16[3]);
9468 *puDst = uDst.u;
9469}
9470
9471
9472IEM_DECL_IMPL_DEF(void, iemAImpl_psubsw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9473{
9474 RTUINT128U uSrc1 = *puDst;
9475 puDst->au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] - puSrc->ai16[0]);
9476 puDst->au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[1] - puSrc->ai16[1]);
9477 puDst->au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] - puSrc->ai16[2]);
9478 puDst->au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[3] - puSrc->ai16[3]);
9479 puDst->au16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[4] - puSrc->ai16[4]);
9480 puDst->au16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[5] - puSrc->ai16[5]);
9481 puDst->au16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[6] - puSrc->ai16[6]);
9482 puDst->au16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[7] - puSrc->ai16[7]);
9483}
9484
9485#endif
9486
9487IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubsw_u128_fallback,(PRTUINT128U puDst,
9488 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9489{
9490 puDst->au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[0] - puSrc2->ai16[0]);
9491 puDst->au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[1] - puSrc2->ai16[1]);
9492 puDst->au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[2] - puSrc2->ai16[2]);
9493 puDst->au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[3] - puSrc2->ai16[3]);
9494 puDst->au16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[4] - puSrc2->ai16[4]);
9495 puDst->au16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[5] - puSrc2->ai16[5]);
9496 puDst->au16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[6] - puSrc2->ai16[6]);
9497 puDst->au16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[7] - puSrc2->ai16[7]);
9498}
9499
9500IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubsw_u256_fallback,(PRTUINT256U puDst,
9501 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9502{
9503 puDst->au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[0] - puSrc2->ai16[0]);
9504 puDst->au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[1] - puSrc2->ai16[1]);
9505 puDst->au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[2] - puSrc2->ai16[2]);
9506 puDst->au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[3] - puSrc2->ai16[3]);
9507 puDst->au16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[4] - puSrc2->ai16[4]);
9508 puDst->au16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[5] - puSrc2->ai16[5]);
9509 puDst->au16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[6] - puSrc2->ai16[6]);
9510 puDst->au16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[7] - puSrc2->ai16[7]);
9511 puDst->au16[8] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[8] - puSrc2->ai16[8]);
9512 puDst->au16[9] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[9] - puSrc2->ai16[9]);
9513 puDst->au16[10] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[10] - puSrc2->ai16[10]);
9514 puDst->au16[11] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[11] - puSrc2->ai16[11]);
9515 puDst->au16[12] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[12] - puSrc2->ai16[12]);
9516 puDst->au16[13] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[13] - puSrc2->ai16[13]);
9517 puDst->au16[14] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[14] - puSrc2->ai16[14]);
9518 puDst->au16[15] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[15] - puSrc2->ai16[15]);
9519}
9520
9521
9522/*
9523 * PSUBUSW / VPSUBUSW
9524 */
/**
 * Saturation step for unsigned word subtraction.
 *
 * The argument is converted to uint32_t.  If the converted value exceeds
 * 0xffff (which is what a negative word difference turns into, e.g. 1 - 1000
 * becomes 0xfffffc19) the result is zero, otherwise it is the value as an
 * uint16_t.
 *
 * @note    a_uDword is evaluated twice, so it must not have side effects.
 */
#define SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(a_uDword) \
    ( (uint32_t)(a_uDword) > (uint16_t)0xffff \
      ? (uint16_t)0 \
      : (uint16_t)(a_uDword) )
9529
9530#ifdef IEM_WITHOUT_ASSEMBLY
9531
9532IEM_DECL_IMPL_DEF(void, iemAImpl_psubusw_u64,(uint64_t *puDst, uint64_t const *puSrc))
9533{
9534 RTUINT64U uSrc1 = { *puDst };
9535 RTUINT64U uSrc2 = { *puSrc };
9536 RTUINT64U uDst;
9537 uDst.au16[0] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[0] - uSrc2.au16[0]);
9538 uDst.au16[1] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[1] - uSrc2.au16[1]);
9539 uDst.au16[2] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[2] - uSrc2.au16[2]);
9540 uDst.au16[3] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[3] - uSrc2.au16[3]);
9541 *puDst = uDst.u;
9542}
9543
9544
9545IEM_DECL_IMPL_DEF(void, iemAImpl_psubusw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9546{
9547 RTUINT128U uSrc1 = *puDst;
9548 puDst->au16[0] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[0] - puSrc->au16[0]);
9549 puDst->au16[1] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[1] - puSrc->au16[1]);
9550 puDst->au16[2] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[2] - puSrc->au16[2]);
9551 puDst->au16[3] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[3] - puSrc->au16[3]);
9552 puDst->au16[4] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[4] - puSrc->au16[4]);
9553 puDst->au16[5] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[5] - puSrc->au16[5]);
9554 puDst->au16[6] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[6] - puSrc->au16[6]);
9555 puDst->au16[7] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[7] - puSrc->au16[7]);
9556}
9557
9558#endif
9559
9560IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubusw_u128_fallback,(PRTUINT128U puDst,
9561 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9562{
9563 puDst->au16[0] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[0] - puSrc2->au16[0]);
9564 puDst->au16[1] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[1] - puSrc2->au16[1]);
9565 puDst->au16[2] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[2] - puSrc2->au16[2]);
9566 puDst->au16[3] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[3] - puSrc2->au16[3]);
9567 puDst->au16[4] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[4] - puSrc2->au16[4]);
9568 puDst->au16[5] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[5] - puSrc2->au16[5]);
9569 puDst->au16[6] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[6] - puSrc2->au16[6]);
9570 puDst->au16[7] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[7] - puSrc2->au16[7]);
9571}
9572
9573IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubusw_u256_fallback,(PRTUINT256U puDst,
9574 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9575{
9576 puDst->au16[0] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[0] - puSrc2->au16[0]);
9577 puDst->au16[1] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[1] - puSrc2->au16[1]);
9578 puDst->au16[2] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[2] - puSrc2->au16[2]);
9579 puDst->au16[3] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[3] - puSrc2->au16[3]);
9580 puDst->au16[4] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[4] - puSrc2->au16[4]);
9581 puDst->au16[5] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[5] - puSrc2->au16[5]);
9582 puDst->au16[6] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[6] - puSrc2->au16[6]);
9583 puDst->au16[7] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[7] - puSrc2->au16[7]);
9584 puDst->au16[8] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[8] - puSrc2->au16[8]);
9585 puDst->au16[9] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[9] - puSrc2->au16[9]);
9586 puDst->au16[10] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[10] - puSrc2->au16[10]);
9587 puDst->au16[11] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[11] - puSrc2->au16[11]);
9588 puDst->au16[12] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[12] - puSrc2->au16[12]);
9589 puDst->au16[13] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[13] - puSrc2->au16[13]);
9590 puDst->au16[14] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[14] - puSrc2->au16[14]);
9591 puDst->au16[15] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[15] - puSrc2->au16[15]);
9592}
9593
9594
9595
9596/*
9597 * PSUBD / VPSUBD.
9598 */
9599#ifdef IEM_WITHOUT_ASSEMBLY
9600
9601IEM_DECL_IMPL_DEF(void, iemAImpl_psubd_u64,(uint64_t *puDst, uint64_t const *puSrc))
9602{
9603 RTUINT64U uSrc1 = { *puDst };
9604 RTUINT64U uSrc2 = { *puSrc };
9605 RTUINT64U uDst;
9606 uDst.au32[0] = uSrc1.au32[0] - uSrc2.au32[0];
9607 uDst.au32[1] = uSrc1.au32[1] - uSrc2.au32[1];
9608 *puDst = uDst.u;
9609}
9610
9611
9612IEM_DECL_IMPL_DEF(void, iemAImpl_psubd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9613{
9614 RTUINT128U uSrc1 = *puDst;
9615 puDst->au32[0] = uSrc1.au32[0] - puSrc->au32[0];
9616 puDst->au32[1] = uSrc1.au32[1] - puSrc->au32[1];
9617 puDst->au32[2] = uSrc1.au32[2] - puSrc->au32[2];
9618 puDst->au32[3] = uSrc1.au32[3] - puSrc->au32[3];
9619}
9620
9621#endif /* IEM_WITHOUT_ASSEMBLY */
9622
9623IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9624{
9625 puDst->au32[0] = puSrc1->au32[0] - puSrc2->au32[0];
9626 puDst->au32[1] = puSrc1->au32[1] - puSrc2->au32[1];
9627 puDst->au32[2] = puSrc1->au32[2] - puSrc2->au32[2];
9628 puDst->au32[3] = puSrc1->au32[3] - puSrc2->au32[3];
9629}
9630
9631IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9632{
9633 puDst->au32[0] = puSrc1->au32[0] - puSrc2->au32[0];
9634 puDst->au32[1] = puSrc1->au32[1] - puSrc2->au32[1];
9635 puDst->au32[2] = puSrc1->au32[2] - puSrc2->au32[2];
9636 puDst->au32[3] = puSrc1->au32[3] - puSrc2->au32[3];
9637 puDst->au32[4] = puSrc1->au32[4] - puSrc2->au32[4];
9638 puDst->au32[5] = puSrc1->au32[5] - puSrc2->au32[5];
9639 puDst->au32[6] = puSrc1->au32[6] - puSrc2->au32[6];
9640 puDst->au32[7] = puSrc1->au32[7] - puSrc2->au32[7];
9641}
9642
9643
9644/*
9645 * PSUBQ / VPSUBQ.
9646 */
9647#ifdef IEM_WITHOUT_ASSEMBLY
9648
9649IEM_DECL_IMPL_DEF(void, iemAImpl_psubq_u64,(uint64_t *puDst, uint64_t const *puSrc))
9650{
9651 *puDst = *puDst - *puSrc;
9652}
9653
9654IEM_DECL_IMPL_DEF(void, iemAImpl_psubq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9655{
9656 RTUINT128U uSrc1 = *puDst;
9657 puDst->au64[0] = uSrc1.au64[0] - puSrc->au64[0];
9658 puDst->au64[1] = uSrc1.au64[1] - puSrc->au64[1];
9659}
9660
9661#endif
9662
9663IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9664{
9665 puDst->au64[0] = puSrc1->au64[0] - puSrc2->au64[0];
9666 puDst->au64[1] = puSrc1->au64[1] - puSrc2->au64[1];
9667}
9668
9669IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9670{
9671 puDst->au64[0] = puSrc1->au64[0] - puSrc2->au64[0];
9672 puDst->au64[1] = puSrc1->au64[1] - puSrc2->au64[1];
9673 puDst->au64[2] = puSrc1->au64[2] - puSrc2->au64[2];
9674 puDst->au64[3] = puSrc1->au64[3] - puSrc2->au64[3];
9675}
9676
9677
9678
9679/*
9680 * PMULLW / VPMULLW / PMULLD / VPMULLD
9681 */
9682#ifdef IEM_WITHOUT_ASSEMBLY
9683
9684IEM_DECL_IMPL_DEF(void, iemAImpl_pmullw_u64,(uint64_t *puDst, uint64_t const *puSrc))
9685{
9686 RTUINT64U uSrc1 = { *puDst };
9687 RTUINT64U uSrc2 = { *puSrc };
9688 RTUINT64U uDst;
9689 uDst.ai16[0] = uSrc1.ai16[0] * uSrc2.ai16[0];
9690 uDst.ai16[1] = uSrc1.ai16[1] * uSrc2.ai16[1];
9691 uDst.ai16[2] = uSrc1.ai16[2] * uSrc2.ai16[2];
9692 uDst.ai16[3] = uSrc1.ai16[3] * uSrc2.ai16[3];
9693 *puDst = uDst.u;
9694}
9695
9696
9697IEM_DECL_IMPL_DEF(void, iemAImpl_pmullw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9698{
9699 RTUINT128U uSrc1 = *puDst;
9700 puDst->ai16[0] = uSrc1.ai16[0] * puSrc->ai16[0];
9701 puDst->ai16[1] = uSrc1.ai16[1] * puSrc->ai16[1];
9702 puDst->ai16[2] = uSrc1.ai16[2] * puSrc->ai16[2];
9703 puDst->ai16[3] = uSrc1.ai16[3] * puSrc->ai16[3];
9704 puDst->ai16[4] = uSrc1.ai16[4] * puSrc->ai16[4];
9705 puDst->ai16[5] = uSrc1.ai16[5] * puSrc->ai16[5];
9706 puDst->ai16[6] = uSrc1.ai16[6] * puSrc->ai16[6];
9707 puDst->ai16[7] = uSrc1.ai16[7] * puSrc->ai16[7];
9708}
9709
9710#endif
9711
9712IEM_DECL_IMPL_DEF(void, iemAImpl_pmulld_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9713{
9714 RTUINT128U uSrc1 = *puDst;
9715
9716 puDst->ai32[0] = uSrc1.ai32[0] * puSrc->ai32[0];
9717 puDst->ai32[1] = uSrc1.ai32[1] * puSrc->ai32[1];
9718 puDst->ai32[2] = uSrc1.ai32[2] * puSrc->ai32[2];
9719 puDst->ai32[3] = uSrc1.ai32[3] * puSrc->ai32[3];
9720}
9721
9722
9723IEM_DECL_IMPL_DEF(void, iemAImpl_vpmullw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9724{
9725 puDst->ai16[0] = puSrc1->ai16[0] * puSrc2->ai16[0];
9726 puDst->ai16[1] = puSrc1->ai16[1] * puSrc2->ai16[1];
9727 puDst->ai16[2] = puSrc1->ai16[2] * puSrc2->ai16[2];
9728 puDst->ai16[3] = puSrc1->ai16[3] * puSrc2->ai16[3];
9729 puDst->ai16[4] = puSrc1->ai16[4] * puSrc2->ai16[4];
9730 puDst->ai16[5] = puSrc1->ai16[5] * puSrc2->ai16[5];
9731 puDst->ai16[6] = puSrc1->ai16[6] * puSrc2->ai16[6];
9732 puDst->ai16[7] = puSrc1->ai16[7] * puSrc2->ai16[7];
9733}
9734
9735
9736IEM_DECL_IMPL_DEF(void, iemAImpl_vpmullw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9737{
9738 puDst->ai16[ 0] = puSrc1->ai16[ 0] * puSrc2->ai16[ 0];
9739 puDst->ai16[ 1] = puSrc1->ai16[ 1] * puSrc2->ai16[ 1];
9740 puDst->ai16[ 2] = puSrc1->ai16[ 2] * puSrc2->ai16[ 2];
9741 puDst->ai16[ 3] = puSrc1->ai16[ 3] * puSrc2->ai16[ 3];
9742 puDst->ai16[ 4] = puSrc1->ai16[ 4] * puSrc2->ai16[ 4];
9743 puDst->ai16[ 5] = puSrc1->ai16[ 5] * puSrc2->ai16[ 5];
9744 puDst->ai16[ 6] = puSrc1->ai16[ 6] * puSrc2->ai16[ 6];
9745 puDst->ai16[ 7] = puSrc1->ai16[ 7] * puSrc2->ai16[ 7];
9746 puDst->ai16[ 8] = puSrc1->ai16[ 8] * puSrc2->ai16[ 8];
9747 puDst->ai16[ 9] = puSrc1->ai16[ 9] * puSrc2->ai16[ 9];
9748 puDst->ai16[10] = puSrc1->ai16[10] * puSrc2->ai16[10];
9749 puDst->ai16[11] = puSrc1->ai16[11] * puSrc2->ai16[11];
9750 puDst->ai16[12] = puSrc1->ai16[12] * puSrc2->ai16[12];
9751 puDst->ai16[13] = puSrc1->ai16[13] * puSrc2->ai16[13];
9752 puDst->ai16[14] = puSrc1->ai16[14] * puSrc2->ai16[14];
9753 puDst->ai16[15] = puSrc1->ai16[15] * puSrc2->ai16[15];
9754}
9755
9756
9757IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulld_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9758{
9759 puDst->ai32[0] = puSrc1->ai32[0] * puSrc2->ai32[0];
9760 puDst->ai32[1] = puSrc1->ai32[1] * puSrc2->ai32[1];
9761 puDst->ai32[2] = puSrc1->ai32[2] * puSrc2->ai32[2];
9762 puDst->ai32[3] = puSrc1->ai32[3] * puSrc2->ai32[3];
9763}
9764
9765
9766IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulld_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9767{
9768 puDst->ai32[0] = puSrc1->ai32[0] * puSrc2->ai32[0];
9769 puDst->ai32[1] = puSrc1->ai32[1] * puSrc2->ai32[1];
9770 puDst->ai32[2] = puSrc1->ai32[2] * puSrc2->ai32[2];
9771 puDst->ai32[3] = puSrc1->ai32[3] * puSrc2->ai32[3];
9772 puDst->ai32[4] = puSrc1->ai32[4] * puSrc2->ai32[4];
9773 puDst->ai32[5] = puSrc1->ai32[5] * puSrc2->ai32[5];
9774 puDst->ai32[6] = puSrc1->ai32[6] * puSrc2->ai32[6];
9775 puDst->ai32[7] = puSrc1->ai32[7] * puSrc2->ai32[7];
9776}
9777
9778
9779/*
9780 * PMULHW / VPMULHW
9781 */
9782#ifdef IEM_WITHOUT_ASSEMBLY
9783
9784IEM_DECL_IMPL_DEF(void, iemAImpl_pmulhw_u64,(uint64_t *puDst, uint64_t const *puSrc))
9785{
9786 RTUINT64U uSrc1 = { *puDst };
9787 RTUINT64U uSrc2 = { *puSrc };
9788 RTUINT64U uDst;
9789 uDst.ai16[0] = RT_HIWORD(uSrc1.ai16[0] * uSrc2.ai16[0]);
9790 uDst.ai16[1] = RT_HIWORD(uSrc1.ai16[1] * uSrc2.ai16[1]);
9791 uDst.ai16[2] = RT_HIWORD(uSrc1.ai16[2] * uSrc2.ai16[2]);
9792 uDst.ai16[3] = RT_HIWORD(uSrc1.ai16[3] * uSrc2.ai16[3]);
9793 *puDst = uDst.u;
9794}
9795
9796
9797IEM_DECL_IMPL_DEF(void, iemAImpl_pmulhw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9798{
9799 RTUINT128U uSrc1 = *puDst;
9800 puDst->ai16[0] = RT_HIWORD(uSrc1.ai16[0] * puSrc->ai16[0]);
9801 puDst->ai16[1] = RT_HIWORD(uSrc1.ai16[1] * puSrc->ai16[1]);
9802 puDst->ai16[2] = RT_HIWORD(uSrc1.ai16[2] * puSrc->ai16[2]);
9803 puDst->ai16[3] = RT_HIWORD(uSrc1.ai16[3] * puSrc->ai16[3]);
9804 puDst->ai16[4] = RT_HIWORD(uSrc1.ai16[4] * puSrc->ai16[4]);
9805 puDst->ai16[5] = RT_HIWORD(uSrc1.ai16[5] * puSrc->ai16[5]);
9806 puDst->ai16[6] = RT_HIWORD(uSrc1.ai16[6] * puSrc->ai16[6]);
9807 puDst->ai16[7] = RT_HIWORD(uSrc1.ai16[7] * puSrc->ai16[7]);
9808}
9809
9810#endif
9811
9812IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulhw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9813{
9814 puDst->ai16[0] = RT_HIWORD(puSrc1->ai16[0] * puSrc2->ai16[0]);
9815 puDst->ai16[1] = RT_HIWORD(puSrc1->ai16[1] * puSrc2->ai16[1]);
9816 puDst->ai16[2] = RT_HIWORD(puSrc1->ai16[2] * puSrc2->ai16[2]);
9817 puDst->ai16[3] = RT_HIWORD(puSrc1->ai16[3] * puSrc2->ai16[3]);
9818 puDst->ai16[4] = RT_HIWORD(puSrc1->ai16[4] * puSrc2->ai16[4]);
9819 puDst->ai16[5] = RT_HIWORD(puSrc1->ai16[5] * puSrc2->ai16[5]);
9820 puDst->ai16[6] = RT_HIWORD(puSrc1->ai16[6] * puSrc2->ai16[6]);
9821 puDst->ai16[7] = RT_HIWORD(puSrc1->ai16[7] * puSrc2->ai16[7]);
9822}
9823
9824
9825IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulhw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9826{
9827 puDst->ai16[ 0] = RT_HIWORD(puSrc1->ai16[ 0] * puSrc2->ai16[ 0]);
9828 puDst->ai16[ 1] = RT_HIWORD(puSrc1->ai16[ 1] * puSrc2->ai16[ 1]);
9829 puDst->ai16[ 2] = RT_HIWORD(puSrc1->ai16[ 2] * puSrc2->ai16[ 2]);
9830 puDst->ai16[ 3] = RT_HIWORD(puSrc1->ai16[ 3] * puSrc2->ai16[ 3]);
9831 puDst->ai16[ 4] = RT_HIWORD(puSrc1->ai16[ 4] * puSrc2->ai16[ 4]);
9832 puDst->ai16[ 5] = RT_HIWORD(puSrc1->ai16[ 5] * puSrc2->ai16[ 5]);
9833 puDst->ai16[ 6] = RT_HIWORD(puSrc1->ai16[ 6] * puSrc2->ai16[ 6]);
9834 puDst->ai16[ 7] = RT_HIWORD(puSrc1->ai16[ 7] * puSrc2->ai16[ 7]);
9835 puDst->ai16[ 8] = RT_HIWORD(puSrc1->ai16[ 8] * puSrc2->ai16[ 8]);
9836 puDst->ai16[ 9] = RT_HIWORD(puSrc1->ai16[ 9] * puSrc2->ai16[ 9]);
9837 puDst->ai16[10] = RT_HIWORD(puSrc1->ai16[10] * puSrc2->ai16[10]);
9838 puDst->ai16[11] = RT_HIWORD(puSrc1->ai16[11] * puSrc2->ai16[11]);
9839 puDst->ai16[12] = RT_HIWORD(puSrc1->ai16[12] * puSrc2->ai16[12]);
9840 puDst->ai16[13] = RT_HIWORD(puSrc1->ai16[13] * puSrc2->ai16[13]);
9841 puDst->ai16[14] = RT_HIWORD(puSrc1->ai16[14] * puSrc2->ai16[14]);
9842 puDst->ai16[15] = RT_HIWORD(puSrc1->ai16[15] * puSrc2->ai16[15]);
9843}
9844
9845
9846/*
9847 * PMULHUW / VPMULHUW
9848 */
9849#ifdef IEM_WITHOUT_ASSEMBLY
9850
9851IEM_DECL_IMPL_DEF(void, iemAImpl_pmulhuw_u64,(uint64_t *puDst, uint64_t const *puSrc))
9852{
9853 RTUINT64U uSrc1 = { *puDst };
9854 RTUINT64U uSrc2 = { *puSrc };
9855 RTUINT64U uDst;
9856 uDst.au16[0] = RT_HIWORD(uSrc1.au16[0] * uSrc2.au16[0]);
9857 uDst.au16[1] = RT_HIWORD(uSrc1.au16[1] * uSrc2.au16[1]);
9858 uDst.au16[2] = RT_HIWORD(uSrc1.au16[2] * uSrc2.au16[2]);
9859 uDst.au16[3] = RT_HIWORD(uSrc1.au16[3] * uSrc2.au16[3]);
9860 *puDst = uDst.u;
9861}
9862
9863
9864IEM_DECL_IMPL_DEF(void, iemAImpl_pmulhuw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9865{
9866 RTUINT128U uSrc1 = *puDst;
9867 puDst->au16[0] = RT_HIWORD(uSrc1.au16[0] * puSrc->au16[0]);
9868 puDst->au16[1] = RT_HIWORD(uSrc1.au16[1] * puSrc->au16[1]);
9869 puDst->au16[2] = RT_HIWORD(uSrc1.au16[2] * puSrc->au16[2]);
9870 puDst->au16[3] = RT_HIWORD(uSrc1.au16[3] * puSrc->au16[3]);
9871 puDst->au16[4] = RT_HIWORD(uSrc1.au16[4] * puSrc->au16[4]);
9872 puDst->au16[5] = RT_HIWORD(uSrc1.au16[5] * puSrc->au16[5]);
9873 puDst->au16[6] = RT_HIWORD(uSrc1.au16[6] * puSrc->au16[6]);
9874 puDst->au16[7] = RT_HIWORD(uSrc1.au16[7] * puSrc->au16[7]);
9875}
9876
9877#endif
9878
9879IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulhuw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9880{
9881 puDst->au16[0] = RT_HIWORD(puSrc1->au16[0] * puSrc2->au16[0]);
9882 puDst->au16[1] = RT_HIWORD(puSrc1->au16[1] * puSrc2->au16[1]);
9883 puDst->au16[2] = RT_HIWORD(puSrc1->au16[2] * puSrc2->au16[2]);
9884 puDst->au16[3] = RT_HIWORD(puSrc1->au16[3] * puSrc2->au16[3]);
9885 puDst->au16[4] = RT_HIWORD(puSrc1->au16[4] * puSrc2->au16[4]);
9886 puDst->au16[5] = RT_HIWORD(puSrc1->au16[5] * puSrc2->au16[5]);
9887 puDst->au16[6] = RT_HIWORD(puSrc1->au16[6] * puSrc2->au16[6]);
9888 puDst->au16[7] = RT_HIWORD(puSrc1->au16[7] * puSrc2->au16[7]);
9889}
9890
9891
9892IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulhuw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9893{
9894 puDst->au16[ 0] = RT_HIWORD(puSrc1->au16[ 0] * puSrc2->au16[ 0]);
9895 puDst->au16[ 1] = RT_HIWORD(puSrc1->au16[ 1] * puSrc2->au16[ 1]);
9896 puDst->au16[ 2] = RT_HIWORD(puSrc1->au16[ 2] * puSrc2->au16[ 2]);
9897 puDst->au16[ 3] = RT_HIWORD(puSrc1->au16[ 3] * puSrc2->au16[ 3]);
9898 puDst->au16[ 4] = RT_HIWORD(puSrc1->au16[ 4] * puSrc2->au16[ 4]);
9899 puDst->au16[ 5] = RT_HIWORD(puSrc1->au16[ 5] * puSrc2->au16[ 5]);
9900 puDst->au16[ 6] = RT_HIWORD(puSrc1->au16[ 6] * puSrc2->au16[ 6]);
9901 puDst->au16[ 7] = RT_HIWORD(puSrc1->au16[ 7] * puSrc2->au16[ 7]);
9902 puDst->au16[ 8] = RT_HIWORD(puSrc1->au16[ 8] * puSrc2->au16[ 8]);
9903 puDst->au16[ 9] = RT_HIWORD(puSrc1->au16[ 9] * puSrc2->au16[ 9]);
9904 puDst->au16[10] = RT_HIWORD(puSrc1->au16[10] * puSrc2->au16[10]);
9905 puDst->au16[11] = RT_HIWORD(puSrc1->au16[11] * puSrc2->au16[11]);
9906 puDst->au16[12] = RT_HIWORD(puSrc1->au16[12] * puSrc2->au16[12]);
9907 puDst->au16[13] = RT_HIWORD(puSrc1->au16[13] * puSrc2->au16[13]);
9908 puDst->au16[14] = RT_HIWORD(puSrc1->au16[14] * puSrc2->au16[14]);
9909 puDst->au16[15] = RT_HIWORD(puSrc1->au16[15] * puSrc2->au16[15]);
9910}
9911
9912
9913/*
9914 * PSRLW / VPSRLW
9915 */
9916#ifdef IEM_WITHOUT_ASSEMBLY
9917
9918IEM_DECL_IMPL_DEF(void, iemAImpl_psrlw_u64,(uint64_t *puDst, uint64_t const *puSrc))
9919{
9920 RTUINT64U uSrc1 = { *puDst };
9921 RTUINT64U uSrc2 = { *puSrc };
9922 RTUINT64U uDst;
9923
9924 if (uSrc2.au64[0] <= 15)
9925 {
9926 uDst.au16[0] = uSrc1.au16[0] >> uSrc2.au8[0];
9927 uDst.au16[1] = uSrc1.au16[1] >> uSrc2.au8[0];
9928 uDst.au16[2] = uSrc1.au16[2] >> uSrc2.au8[0];
9929 uDst.au16[3] = uSrc1.au16[3] >> uSrc2.au8[0];
9930 }
9931 else
9932 {
9933 uDst.au64[0] = 0;
9934 }
9935 *puDst = uDst.u;
9936}
9937
9938
9939IEM_DECL_IMPL_DEF(void, iemAImpl_psrlw_imm_u64,(uint64_t *puDst, uint8_t uShift))
9940{
9941 RTUINT64U uSrc1 = { *puDst };
9942 RTUINT64U uDst;
9943
9944 if (uShift <= 15)
9945 {
9946 uDst.au16[0] = uSrc1.au16[0] >> uShift;
9947 uDst.au16[1] = uSrc1.au16[1] >> uShift;
9948 uDst.au16[2] = uSrc1.au16[2] >> uShift;
9949 uDst.au16[3] = uSrc1.au16[3] >> uShift;
9950 }
9951 else
9952 {
9953 uDst.au64[0] = 0;
9954 }
9955 *puDst = uDst.u;
9956}
9957
9958
9959IEM_DECL_IMPL_DEF(void, iemAImpl_psrlw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9960{
9961 RTUINT128U uSrc1 = *puDst;
9962
9963 if (puSrc->au64[0] <= 15)
9964 {
9965 puDst->au16[0] = uSrc1.au16[0] >> puSrc->au8[0];
9966 puDst->au16[1] = uSrc1.au16[1] >> puSrc->au8[0];
9967 puDst->au16[2] = uSrc1.au16[2] >> puSrc->au8[0];
9968 puDst->au16[3] = uSrc1.au16[3] >> puSrc->au8[0];
9969 puDst->au16[4] = uSrc1.au16[4] >> puSrc->au8[0];
9970 puDst->au16[5] = uSrc1.au16[5] >> puSrc->au8[0];
9971 puDst->au16[6] = uSrc1.au16[6] >> puSrc->au8[0];
9972 puDst->au16[7] = uSrc1.au16[7] >> puSrc->au8[0];
9973 }
9974 else
9975 {
9976 puDst->au64[0] = 0;
9977 puDst->au64[1] = 0;
9978 }
9979}
9980
9981IEM_DECL_IMPL_DEF(void, iemAImpl_psrlw_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
9982{
9983 RTUINT128U uSrc1 = *puDst;
9984
9985 if (uShift <= 15)
9986 {
9987 puDst->au16[0] = uSrc1.au16[0] >> uShift;
9988 puDst->au16[1] = uSrc1.au16[1] >> uShift;
9989 puDst->au16[2] = uSrc1.au16[2] >> uShift;
9990 puDst->au16[3] = uSrc1.au16[3] >> uShift;
9991 puDst->au16[4] = uSrc1.au16[4] >> uShift;
9992 puDst->au16[5] = uSrc1.au16[5] >> uShift;
9993 puDst->au16[6] = uSrc1.au16[6] >> uShift;
9994 puDst->au16[7] = uSrc1.au16[7] >> uShift;
9995 }
9996 else
9997 {
9998 puDst->au64[0] = 0;
9999 puDst->au64[1] = 0;
10000 }
10001}
10002
10003#endif
10004
10005IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrlw_imm_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, uint8_t uShift))
10006{
10007 RTUINT128U uSrc1 = *puSrc1;
10008
10009 if (uShift <= 15)
10010 {
10011 puDst->au16[0] = uSrc1.au16[0] >> uShift;
10012 puDst->au16[1] = uSrc1.au16[1] >> uShift;
10013 puDst->au16[2] = uSrc1.au16[2] >> uShift;
10014 puDst->au16[3] = uSrc1.au16[3] >> uShift;
10015 puDst->au16[4] = uSrc1.au16[4] >> uShift;
10016 puDst->au16[5] = uSrc1.au16[5] >> uShift;
10017 puDst->au16[6] = uSrc1.au16[6] >> uShift;
10018 puDst->au16[7] = uSrc1.au16[7] >> uShift;
10019 }
10020 else
10021 {
10022 puDst->au64[0] = 0;
10023 puDst->au64[1] = 0;
10024 }
10025}
10026
10027IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrlw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10028{
10029 iemAImpl_vpsrlw_imm_u128_fallback(puDst, puSrc1, RT_MIN(16, puSrc2->au64[0]));
10030}
10031
10032IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrlw_imm_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc1, uint8_t uShift))
10033{
10034 iemAImpl_vpsrlw_imm_u128_fallback(puDst, puSrc1, uShift);
10035}
10036
10037IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrlw_imm_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, uint8_t uShift))
10038{
10039 RTUINT256U uSrc1 = *puSrc1;
10040
10041 if (uShift <= 15)
10042 {
10043 puDst->au16[0] = uSrc1.au16[0] >> uShift;
10044 puDst->au16[1] = uSrc1.au16[1] >> uShift;
10045 puDst->au16[2] = uSrc1.au16[2] >> uShift;
10046 puDst->au16[3] = uSrc1.au16[3] >> uShift;
10047 puDst->au16[4] = uSrc1.au16[4] >> uShift;
10048 puDst->au16[5] = uSrc1.au16[5] >> uShift;
10049 puDst->au16[6] = uSrc1.au16[6] >> uShift;
10050 puDst->au16[7] = uSrc1.au16[7] >> uShift;
10051 puDst->au16[8] = uSrc1.au16[8] >> uShift;
10052 puDst->au16[9] = uSrc1.au16[9] >> uShift;
10053 puDst->au16[10] = uSrc1.au16[10] >> uShift;
10054 puDst->au16[11] = uSrc1.au16[11] >> uShift;
10055 puDst->au16[12] = uSrc1.au16[12] >> uShift;
10056 puDst->au16[13] = uSrc1.au16[13] >> uShift;
10057 puDst->au16[14] = uSrc1.au16[14] >> uShift;
10058 puDst->au16[15] = uSrc1.au16[15] >> uShift;
10059 }
10060 else
10061 {
10062 puDst->au64[0] = 0;
10063 puDst->au64[1] = 0;
10064 puDst->au64[2] = 0;
10065 puDst->au64[3] = 0;
10066 }
10067}
10068
10069IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrlw_imm_u256,(PRTUINT256U puDst, PCRTUINT256U puSrc1, uint8_t uShift))
10070{
10071 iemAImpl_vpsrlw_imm_u256_fallback(puDst, puSrc1, uShift);
10072}
10073
10074IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrlw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10075{
10076 iemAImpl_vpsrlw_imm_u256_fallback(puDst, puSrc1, RT_MIN(16, puSrc2->au64[0]));
10077}
10078
10079
10080/*
10081 * PSRAW / VPSRAW
10082 */
10083#ifdef IEM_WITHOUT_ASSEMBLY
10084
10085IEM_DECL_IMPL_DEF(void, iemAImpl_psraw_u64,(uint64_t *puDst, uint64_t const *puSrc))
10086{
10087 RTUINT64U uSrc1 = { *puDst };
10088 RTUINT64U uSrc2 = { *puSrc };
10089 RTUINT64U uDst;
10090 uint8_t uShift;
10091
10092 uShift = RT_MIN(15, uSrc2.au64[0]);
10093
10094 uDst.ai16[0] = uSrc1.ai16[0] >> uShift;
10095 uDst.ai16[1] = uSrc1.ai16[1] >> uShift;
10096 uDst.ai16[2] = uSrc1.ai16[2] >> uShift;
10097 uDst.ai16[3] = uSrc1.ai16[3] >> uShift;
10098
10099 *puDst = uDst.u;
10100}
10101
10102
10103IEM_DECL_IMPL_DEF(void, iemAImpl_psraw_imm_u64,(uint64_t *puDst, uint8_t uShift))
10104{
10105 RTUINT64U uSrc1 = { *puDst };
10106 RTUINT64U uDst;
10107
10108 uShift = RT_MIN(15, uShift);
10109
10110 uDst.ai16[0] = uSrc1.ai16[0] >> uShift;
10111 uDst.ai16[1] = uSrc1.ai16[1] >> uShift;
10112 uDst.ai16[2] = uSrc1.ai16[2] >> uShift;
10113 uDst.ai16[3] = uSrc1.ai16[3] >> uShift;
10114
10115 *puDst = uDst.u;
10116}
10117
10118
10119IEM_DECL_IMPL_DEF(void, iemAImpl_psraw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
10120{
10121 RTUINT128U uSrc1 = *puDst;
10122 uint8_t uShift;
10123
10124 uShift = RT_MIN(15, puSrc->au64[0]);
10125
10126 puDst->ai16[0] = uSrc1.ai16[0] >> uShift;
10127 puDst->ai16[1] = uSrc1.ai16[1] >> uShift;
10128 puDst->ai16[2] = uSrc1.ai16[2] >> uShift;
10129 puDst->ai16[3] = uSrc1.ai16[3] >> uShift;
10130 puDst->ai16[4] = uSrc1.ai16[4] >> uShift;
10131 puDst->ai16[5] = uSrc1.ai16[5] >> uShift;
10132 puDst->ai16[6] = uSrc1.ai16[6] >> uShift;
10133 puDst->ai16[7] = uSrc1.ai16[7] >> uShift;
10134}
10135
10136IEM_DECL_IMPL_DEF(void, iemAImpl_psraw_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
10137{
10138 RTUINT128U uSrc1 = *puDst;
10139
10140 uShift = RT_MIN(15, uShift);
10141
10142 puDst->ai16[0] = uSrc1.ai16[0] >> uShift;
10143 puDst->ai16[1] = uSrc1.ai16[1] >> uShift;
10144 puDst->ai16[2] = uSrc1.ai16[2] >> uShift;
10145 puDst->ai16[3] = uSrc1.ai16[3] >> uShift;
10146 puDst->ai16[4] = uSrc1.ai16[4] >> uShift;
10147 puDst->ai16[5] = uSrc1.ai16[5] >> uShift;
10148 puDst->ai16[6] = uSrc1.ai16[6] >> uShift;
10149 puDst->ai16[7] = uSrc1.ai16[7] >> uShift;
10150}
10151
10152#endif
10153
10154IEM_DECL_IMPL_DEF(void, iemAImpl_vpsraw_imm_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, uint8_t uShift))
10155{
10156 RTUINT128U uSrc1 = *puSrc1;
10157
10158 uShift = RT_MIN(15, uShift);
10159
10160 puDst->ai16[0] = uSrc1.ai16[0] >> uShift;
10161 puDst->ai16[1] = uSrc1.ai16[1] >> uShift;
10162 puDst->ai16[2] = uSrc1.ai16[2] >> uShift;
10163 puDst->ai16[3] = uSrc1.ai16[3] >> uShift;
10164 puDst->ai16[4] = uSrc1.ai16[4] >> uShift;
10165 puDst->ai16[5] = uSrc1.ai16[5] >> uShift;
10166 puDst->ai16[6] = uSrc1.ai16[6] >> uShift;
10167 puDst->ai16[7] = uSrc1.ai16[7] >> uShift;
10168}
10169
10170IEM_DECL_IMPL_DEF(void, iemAImpl_vpsraw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10171{
10172 iemAImpl_vpsraw_imm_u128_fallback(puDst, puSrc1, RT_MIN(15, puSrc2->au64[0]));
10173}
10174
10175IEM_DECL_IMPL_DEF(void, iemAImpl_vpsraw_imm_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc1, uint8_t uShift))
10176{
10177 iemAImpl_vpsraw_imm_u128_fallback(puDst, puSrc1, uShift);
10178}
10179
10180IEM_DECL_IMPL_DEF(void, iemAImpl_vpsraw_imm_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, uint8_t uShift))
10181{
10182 RTUINT256U uSrc1 = *puSrc1;
10183
10184 uShift = RT_MIN(15, uShift);
10185
10186 puDst->ai16[0] = uSrc1.ai16[0] >> uShift;
10187 puDst->ai16[1] = uSrc1.ai16[1] >> uShift;
10188 puDst->ai16[2] = uSrc1.ai16[2] >> uShift;
10189 puDst->ai16[3] = uSrc1.ai16[3] >> uShift;
10190 puDst->ai16[4] = uSrc1.ai16[4] >> uShift;
10191 puDst->ai16[5] = uSrc1.ai16[5] >> uShift;
10192 puDst->ai16[6] = uSrc1.ai16[6] >> uShift;
10193 puDst->ai16[7] = uSrc1.ai16[7] >> uShift;
10194 puDst->ai16[8] = uSrc1.ai16[8] >> uShift;
10195 puDst->ai16[9] = uSrc1.ai16[9] >> uShift;
10196 puDst->ai16[10] = uSrc1.ai16[10] >> uShift;
10197 puDst->ai16[11] = uSrc1.ai16[11] >> uShift;
10198 puDst->ai16[12] = uSrc1.ai16[12] >> uShift;
10199 puDst->ai16[13] = uSrc1.ai16[13] >> uShift;
10200 puDst->ai16[14] = uSrc1.ai16[14] >> uShift;
10201 puDst->ai16[15] = uSrc1.ai16[15] >> uShift;
10202}
10203
10204IEM_DECL_IMPL_DEF(void, iemAImpl_vpsraw_imm_u256,(PRTUINT256U puDst, PCRTUINT256U puSrc1, uint8_t uShift))
10205{
10206 iemAImpl_vpsraw_imm_u256_fallback(puDst, puSrc1, uShift);
10207}
10208
10209IEM_DECL_IMPL_DEF(void, iemAImpl_vpsraw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10210{
10211 iemAImpl_vpsraw_imm_u256_fallback(puDst, puSrc1, RT_MIN(15, puSrc2->au64[0]));
10212}
10213
10214
10215/*
10216 * PSLLW / VPSLLW
10217 */
10218#ifdef IEM_WITHOUT_ASSEMBLY
10219
10220IEM_DECL_IMPL_DEF(void, iemAImpl_psllw_u64,(uint64_t *puDst, uint64_t const *puSrc))
10221{
10222 RTUINT64U uSrc1 = { *puDst };
10223 RTUINT64U uSrc2 = { *puSrc };
10224 RTUINT64U uDst;
10225
10226 if (uSrc2.au64[0] <= 15)
10227 {
10228 uDst.au16[0] = uSrc1.au16[0] << uSrc2.au8[0];
10229 uDst.au16[1] = uSrc1.au16[1] << uSrc2.au8[0];
10230 uDst.au16[2] = uSrc1.au16[2] << uSrc2.au8[0];
10231 uDst.au16[3] = uSrc1.au16[3] << uSrc2.au8[0];
10232 }
10233 else
10234 {
10235 uDst.au64[0] = 0;
10236 }
10237 *puDst = uDst.u;
10238}
10239
10240
10241IEM_DECL_IMPL_DEF(void, iemAImpl_psllw_imm_u64,(uint64_t *puDst, uint8_t uShift))
10242{
10243 RTUINT64U uSrc1 = { *puDst };
10244 RTUINT64U uDst;
10245
10246 if (uShift <= 15)
10247 {
10248 uDst.au16[0] = uSrc1.au16[0] << uShift;
10249 uDst.au16[1] = uSrc1.au16[1] << uShift;
10250 uDst.au16[2] = uSrc1.au16[2] << uShift;
10251 uDst.au16[3] = uSrc1.au16[3] << uShift;
10252 }
10253 else
10254 {
10255 uDst.au64[0] = 0;
10256 }
10257 *puDst = uDst.u;
10258}
10259
10260
10261IEM_DECL_IMPL_DEF(void, iemAImpl_psllw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
10262{
10263 RTUINT128U uSrc1 = *puDst;
10264
10265 if (puSrc->au64[0] <= 15)
10266 {
10267 puDst->au16[0] = uSrc1.au16[0] << puSrc->au8[0];
10268 puDst->au16[1] = uSrc1.au16[1] << puSrc->au8[0];
10269 puDst->au16[2] = uSrc1.au16[2] << puSrc->au8[0];
10270 puDst->au16[3] = uSrc1.au16[3] << puSrc->au8[0];
10271 puDst->au16[4] = uSrc1.au16[4] << puSrc->au8[0];
10272 puDst->au16[5] = uSrc1.au16[5] << puSrc->au8[0];
10273 puDst->au16[6] = uSrc1.au16[6] << puSrc->au8[0];
10274 puDst->au16[7] = uSrc1.au16[7] << puSrc->au8[0];
10275 }
10276 else
10277 {
10278 puDst->au64[0] = 0;
10279 puDst->au64[1] = 0;
10280 }
10281}
10282
10283IEM_DECL_IMPL_DEF(void, iemAImpl_psllw_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
10284{
10285 RTUINT128U uSrc1 = *puDst;
10286
10287 if (uShift <= 15)
10288 {
10289 puDst->au16[0] = uSrc1.au16[0] << uShift;
10290 puDst->au16[1] = uSrc1.au16[1] << uShift;
10291 puDst->au16[2] = uSrc1.au16[2] << uShift;
10292 puDst->au16[3] = uSrc1.au16[3] << uShift;
10293 puDst->au16[4] = uSrc1.au16[4] << uShift;
10294 puDst->au16[5] = uSrc1.au16[5] << uShift;
10295 puDst->au16[6] = uSrc1.au16[6] << uShift;
10296 puDst->au16[7] = uSrc1.au16[7] << uShift;
10297 }
10298 else
10299 {
10300 puDst->au64[0] = 0;
10301 puDst->au64[1] = 0;
10302 }
10303}
10304
10305#endif
10306
10307IEM_DECL_IMPL_DEF(void, iemAImpl_vpsllw_imm_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, uint8_t uShift))
10308{
10309 RTUINT128U uSrc1 = *puSrc1;
10310
10311 if (uShift <= 15)
10312 {
10313 puDst->au16[0] = uSrc1.au16[0] << uShift;
10314 puDst->au16[1] = uSrc1.au16[1] << uShift;
10315 puDst->au16[2] = uSrc1.au16[2] << uShift;
10316 puDst->au16[3] = uSrc1.au16[3] << uShift;
10317 puDst->au16[4] = uSrc1.au16[4] << uShift;
10318 puDst->au16[5] = uSrc1.au16[5] << uShift;
10319 puDst->au16[6] = uSrc1.au16[6] << uShift;
10320 puDst->au16[7] = uSrc1.au16[7] << uShift;
10321 }
10322 else
10323 {
10324 puDst->au64[0] = 0;
10325 puDst->au64[1] = 0;
10326 }
10327}
10328
10329IEM_DECL_IMPL_DEF(void, iemAImpl_vpsllw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10330{
10331 iemAImpl_vpsllw_imm_u128_fallback(puDst, puSrc1, RT_MIN(16, puSrc2->au64[0]));
10332}
10333
10334IEM_DECL_IMPL_DEF(void, iemAImpl_vpsllw_imm_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc1, uint8_t uShift))
10335{
10336 iemAImpl_vpsllw_imm_u128_fallback(puDst, puSrc1, uShift);
10337}
10338
10339IEM_DECL_IMPL_DEF(void, iemAImpl_vpsllw_imm_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, uint8_t uShift))
10340{
10341 RTUINT256U uSrc1 = *puSrc1;
10342
10343 if (uShift <= 15)
10344 {
10345 puDst->au16[0] = uSrc1.au16[0] << uShift;
10346 puDst->au16[1] = uSrc1.au16[1] << uShift;
10347 puDst->au16[2] = uSrc1.au16[2] << uShift;
10348 puDst->au16[3] = uSrc1.au16[3] << uShift;
10349 puDst->au16[4] = uSrc1.au16[4] << uShift;
10350 puDst->au16[5] = uSrc1.au16[5] << uShift;
10351 puDst->au16[6] = uSrc1.au16[6] << uShift;
10352 puDst->au16[7] = uSrc1.au16[7] << uShift;
10353 puDst->au16[8] = uSrc1.au16[8] << uShift;
10354 puDst->au16[9] = uSrc1.au16[9] << uShift;
10355 puDst->au16[10] = uSrc1.au16[10] << uShift;
10356 puDst->au16[11] = uSrc1.au16[11] << uShift;
10357 puDst->au16[12] = uSrc1.au16[12] << uShift;
10358 puDst->au16[13] = uSrc1.au16[13] << uShift;
10359 puDst->au16[14] = uSrc1.au16[14] << uShift;
10360 puDst->au16[15] = uSrc1.au16[15] << uShift;
10361 }
10362 else
10363 {
10364 puDst->au64[0] = 0;
10365 puDst->au64[1] = 0;
10366 puDst->au64[2] = 0;
10367 puDst->au64[3] = 0;
10368 }
10369}
10370
10371IEM_DECL_IMPL_DEF(void, iemAImpl_vpsllw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10372{
10373 iemAImpl_vpsllw_imm_u256_fallback(puDst, puSrc1, RT_MIN(16, puSrc2->au64[0]));
10374}
10375
10376IEM_DECL_IMPL_DEF(void, iemAImpl_vpsllw_imm_u256,(PRTUINT256U puDst, PCRTUINT256U puSrc1, uint8_t uShift))
10377{
10378 iemAImpl_vpsllw_imm_u256_fallback(puDst, puSrc1, uShift);
10379}
10380
10381/*
10382 * PSRLD / VPSRLD
10383 */
10384#ifdef IEM_WITHOUT_ASSEMBLY
10385
10386IEM_DECL_IMPL_DEF(void, iemAImpl_psrld_u64,(uint64_t *puDst, uint64_t const *puSrc))
10387{
10388 RTUINT64U uSrc1 = { *puDst };
10389 RTUINT64U uSrc2 = { *puSrc };
10390 RTUINT64U uDst;
10391
10392 if (uSrc2.au64[0] <= 31)
10393 {
10394 uDst.au32[0] = uSrc1.au32[0] >> uSrc2.au8[0];
10395 uDst.au32[1] = uSrc1.au32[1] >> uSrc2.au8[0];
10396 }
10397 else
10398 {
10399 uDst.au64[0] = 0;
10400 }
10401 *puDst = uDst.u;
10402}
10403
10404
10405IEM_DECL_IMPL_DEF(void, iemAImpl_psrld_imm_u64,(uint64_t *puDst, uint8_t uShift))
10406{
10407 RTUINT64U uSrc1 = { *puDst };
10408 RTUINT64U uDst;
10409
10410 if (uShift <= 31)
10411 {
10412 uDst.au32[0] = uSrc1.au32[0] >> uShift;
10413 uDst.au32[1] = uSrc1.au32[1] >> uShift;
10414 }
10415 else
10416 {
10417 uDst.au64[0] = 0;
10418 }
10419 *puDst = uDst.u;
10420}
10421
10422
10423IEM_DECL_IMPL_DEF(void, iemAImpl_psrld_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
10424{
10425 RTUINT128U uSrc1 = *puDst;
10426
10427 if (puSrc->au64[0] <= 31)
10428 {
10429 puDst->au32[0] = uSrc1.au32[0] >> puSrc->au8[0];
10430 puDst->au32[1] = uSrc1.au32[1] >> puSrc->au8[0];
10431 puDst->au32[2] = uSrc1.au32[2] >> puSrc->au8[0];
10432 puDst->au32[3] = uSrc1.au32[3] >> puSrc->au8[0];
10433 }
10434 else
10435 {
10436 puDst->au64[0] = 0;
10437 puDst->au64[1] = 0;
10438 }
10439}
10440
10441IEM_DECL_IMPL_DEF(void, iemAImpl_psrld_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
10442{
10443 RTUINT128U uSrc1 = *puDst;
10444
10445 if (uShift <= 31)
10446 {
10447 puDst->au32[0] = uSrc1.au32[0] >> uShift;
10448 puDst->au32[1] = uSrc1.au32[1] >> uShift;
10449 puDst->au32[2] = uSrc1.au32[2] >> uShift;
10450 puDst->au32[3] = uSrc1.au32[3] >> uShift;
10451 }
10452 else
10453 {
10454 puDst->au64[0] = 0;
10455 puDst->au64[1] = 0;
10456 }
10457}
10458
10459#endif
10460
10461IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrld_imm_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, uint8_t uShift))
10462{
10463 RTUINT128U uSrc1 = *puSrc1;
10464
10465 if (uShift <= 31)
10466 {
10467 puDst->au32[0] = uSrc1.au32[0] >> uShift;
10468 puDst->au32[1] = uSrc1.au32[1] >> uShift;
10469 puDst->au32[2] = uSrc1.au32[2] >> uShift;
10470 puDst->au32[3] = uSrc1.au32[3] >> uShift;
10471 }
10472 else
10473 {
10474 puDst->au64[0] = 0;
10475 puDst->au64[1] = 0;
10476 }
10477}
10478
10479IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrld_imm_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc1, uint8_t uShift))
10480{
10481 iemAImpl_vpsrld_imm_u128_fallback(puDst, puSrc1, uShift);
10482}
10483
10484IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrld_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10485{
10486 iemAImpl_vpsrld_imm_u128_fallback(puDst, puSrc1, RT_MIN(32, puSrc2->au64[0]));
10487}
10488
10489IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrld_imm_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, uint8_t uShift))
10490{
10491 RTUINT256U uSrc1 = *puSrc1;
10492
10493 if (uShift <= 31)
10494 {
10495 puDst->au32[0] = uSrc1.au32[0] >> uShift;
10496 puDst->au32[1] = uSrc1.au32[1] >> uShift;
10497 puDst->au32[2] = uSrc1.au32[2] >> uShift;
10498 puDst->au32[3] = uSrc1.au32[3] >> uShift;
10499 puDst->au32[4] = uSrc1.au32[4] >> uShift;
10500 puDst->au32[5] = uSrc1.au32[5] >> uShift;
10501 puDst->au32[6] = uSrc1.au32[6] >> uShift;
10502 puDst->au32[7] = uSrc1.au32[7] >> uShift;
10503 }
10504 else
10505 {
10506 puDst->au64[0] = 0;
10507 puDst->au64[1] = 0;
10508 puDst->au64[2] = 0;
10509 puDst->au64[3] = 0;
10510 }
10511}
10512
10513IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrld_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10514{
10515 iemAImpl_vpsrld_imm_u256_fallback(puDst, puSrc1, RT_MIN(32, puSrc2->au64[0]));
10516}
10517
10518IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrld_imm_u256,(PRTUINT256U puDst, PCRTUINT256U puSrc1, uint8_t uShift))
10519{
10520 iemAImpl_vpsrld_imm_u256_fallback(puDst, puSrc1, uShift);
10521}
10522
10523
10524/*
10525 * PSRAD / VPSRAD
10526 */
10527#ifdef IEM_WITHOUT_ASSEMBLY
10528
10529IEM_DECL_IMPL_DEF(void, iemAImpl_psrad_u64,(uint64_t *puDst, uint64_t const *puSrc))
10530{
10531 RTUINT64U uSrc1 = { *puDst };
10532 RTUINT64U uSrc2 = { *puSrc };
10533 RTUINT64U uDst;
10534 uint8_t uShift;
10535
10536 uShift = RT_MIN(31, uSrc2.au64[0]);
10537
10538 uDst.ai32[0] = uSrc1.ai32[0] >> uShift;
10539 uDst.ai32[1] = uSrc1.ai32[1] >> uShift;
10540
10541 *puDst = uDst.u;
10542}
10543
10544
10545IEM_DECL_IMPL_DEF(void, iemAImpl_psrad_imm_u64,(uint64_t *puDst, uint8_t uShift))
10546{
10547 RTUINT64U uSrc1 = { *puDst };
10548 RTUINT64U uDst;
10549
10550 uShift = RT_MIN(31, uShift);
10551
10552 uDst.ai32[0] = uSrc1.ai32[0] >> uShift;
10553 uDst.ai32[1] = uSrc1.ai32[1] >> uShift;
10554
10555 *puDst = uDst.u;
10556}
10557
10558
10559IEM_DECL_IMPL_DEF(void, iemAImpl_psrad_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
10560{
10561 RTUINT128U uSrc1 = *puDst;
10562 uint8_t uShift;
10563
10564 uShift = RT_MIN(31, puSrc->au64[0]);
10565
10566 puDst->ai32[0] = uSrc1.ai32[0] >> uShift;
10567 puDst->ai32[1] = uSrc1.ai32[1] >> uShift;
10568 puDst->ai32[2] = uSrc1.ai32[2] >> uShift;
10569 puDst->ai32[3] = uSrc1.ai32[3] >> uShift;
10570}
10571
10572IEM_DECL_IMPL_DEF(void, iemAImpl_psrad_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
10573{
10574 RTUINT128U uSrc1 = *puDst;
10575
10576 uShift = RT_MIN(31, uShift);
10577
10578 puDst->ai32[0] = uSrc1.ai32[0] >> uShift;
10579 puDst->ai32[1] = uSrc1.ai32[1] >> uShift;
10580 puDst->ai32[2] = uSrc1.ai32[2] >> uShift;
10581 puDst->ai32[3] = uSrc1.ai32[3] >> uShift;
10582}
10583
10584#endif
10585
10586IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrad_imm_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, uint8_t uShift))
10587{
10588 RTUINT128U uSrc1 = *puSrc1;
10589
10590 uShift = RT_MIN(31, uShift);
10591
10592 puDst->ai32[0] = uSrc1.ai32[0] >> uShift;
10593 puDst->ai32[1] = uSrc1.ai32[1] >> uShift;
10594 puDst->ai32[2] = uSrc1.ai32[2] >> uShift;
10595 puDst->ai32[3] = uSrc1.ai32[3] >> uShift;
10596}
10597
10598IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrad_imm_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc1, uint8_t uShift))
10599{
10600 iemAImpl_vpsrad_imm_u128_fallback(puDst, puSrc1, uShift);
10601}
10602
10603IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrad_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10604{
10605 iemAImpl_vpsrad_imm_u128_fallback(puDst, puSrc1, RT_MIN(31, puSrc2->au64[0]));
10606}
10607
10608IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrad_imm_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, uint8_t uShift))
10609{
10610 RTUINT256U uSrc1 = *puSrc1;
10611
10612 uShift = RT_MIN(31, uShift);
10613
10614 puDst->ai32[0] = uSrc1.ai32[0] >> uShift;
10615 puDst->ai32[1] = uSrc1.ai32[1] >> uShift;
10616 puDst->ai32[2] = uSrc1.ai32[2] >> uShift;
10617 puDst->ai32[3] = uSrc1.ai32[3] >> uShift;
10618 puDst->ai32[4] = uSrc1.ai32[4] >> uShift;
10619 puDst->ai32[5] = uSrc1.ai32[5] >> uShift;
10620 puDst->ai32[6] = uSrc1.ai32[6] >> uShift;
10621 puDst->ai32[7] = uSrc1.ai32[7] >> uShift;
10622}
10623
10624IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrad_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10625{
10626 iemAImpl_vpsrad_imm_u256_fallback(puDst, puSrc1, RT_MIN(31, puSrc2->au64[0]));
10627}
10628
10629IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrad_imm_u256,(PRTUINT256U puDst, PCRTUINT256U puSrc1, uint8_t uShift))
10630{
10631 iemAImpl_vpsrad_imm_u256_fallback(puDst, puSrc1, uShift);
10632}
10633
10634
10635/*
10636 * PSLLD / VPSLLD
10637 */
10638#ifdef IEM_WITHOUT_ASSEMBLY
10639
10640IEM_DECL_IMPL_DEF(void, iemAImpl_pslld_u64,(uint64_t *puDst, uint64_t const *puSrc))
10641{
10642 RTUINT64U uSrc1 = { *puDst };
10643 RTUINT64U uSrc2 = { *puSrc };
10644 RTUINT64U uDst;
10645
10646 if (uSrc2.au64[0] <= 31)
10647 {
10648 uDst.au32[0] = uSrc1.au32[0] << uSrc2.au8[0];
10649 uDst.au32[1] = uSrc1.au32[1] << uSrc2.au8[0];
10650 }
10651 else
10652 {
10653 uDst.au64[0] = 0;
10654 }
10655 *puDst = uDst.u;
10656}
10657
10658
10659IEM_DECL_IMPL_DEF(void, iemAImpl_pslld_imm_u64,(uint64_t *puDst, uint8_t uShift))
10660{
10661 RTUINT64U uSrc1 = { *puDst };
10662 RTUINT64U uDst;
10663
10664 if (uShift <= 31)
10665 {
10666 uDst.au32[0] = uSrc1.au32[0] << uShift;
10667 uDst.au32[1] = uSrc1.au32[1] << uShift;
10668 }
10669 else
10670 {
10671 uDst.au64[0] = 0;
10672 }
10673 *puDst = uDst.u;
10674}
10675
10676
10677IEM_DECL_IMPL_DEF(void, iemAImpl_pslld_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
10678{
10679 RTUINT128U uSrc1 = *puDst;
10680
10681 if (puSrc->au64[0] <= 31)
10682 {
10683 puDst->au32[0] = uSrc1.au32[0] << puSrc->au8[0];
10684 puDst->au32[1] = uSrc1.au32[1] << puSrc->au8[0];
10685 puDst->au32[2] = uSrc1.au32[2] << puSrc->au8[0];
10686 puDst->au32[3] = uSrc1.au32[3] << puSrc->au8[0];
10687 }
10688 else
10689 {
10690 puDst->au64[0] = 0;
10691 puDst->au64[1] = 0;
10692 }
10693}
10694
10695IEM_DECL_IMPL_DEF(void, iemAImpl_pslld_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
10696{
10697 RTUINT128U uSrc1 = *puDst;
10698
10699 if (uShift <= 31)
10700 {
10701 puDst->au32[0] = uSrc1.au32[0] << uShift;
10702 puDst->au32[1] = uSrc1.au32[1] << uShift;
10703 puDst->au32[2] = uSrc1.au32[2] << uShift;
10704 puDst->au32[3] = uSrc1.au32[3] << uShift;
10705 }
10706 else
10707 {
10708 puDst->au64[0] = 0;
10709 puDst->au64[1] = 0;
10710 }
10711}
10712
10713#endif
10714
10715IEM_DECL_IMPL_DEF(void, iemAImpl_vpslld_imm_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, uint8_t uShift))
10716{
10717 RTUINT128U uSrc1 = *puSrc1;
10718
10719 if (uShift <= 31)
10720 {
10721 puDst->au32[0] = uSrc1.au32[0] << uShift;
10722 puDst->au32[1] = uSrc1.au32[1] << uShift;
10723 puDst->au32[2] = uSrc1.au32[2] << uShift;
10724 puDst->au32[3] = uSrc1.au32[3] << uShift;
10725 }
10726 else
10727 {
10728 puDst->au64[0] = 0;
10729 puDst->au64[1] = 0;
10730 }
10731}
10732
10733IEM_DECL_IMPL_DEF(void, iemAImpl_vpslld_imm_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc1, uint8_t uShift))
10734{
10735 iemAImpl_vpslld_imm_u128_fallback(puDst, puSrc1, uShift);
10736}
10737
10738IEM_DECL_IMPL_DEF(void, iemAImpl_vpslld_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10739{
10740 iemAImpl_vpslld_imm_u128_fallback(puDst, puSrc1, RT_MIN(32, puSrc2->au64[0]));
10741}
10742
10743IEM_DECL_IMPL_DEF(void, iemAImpl_vpslld_imm_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, uint8_t uShift))
10744{
10745 RTUINT256U uSrc1 = *puSrc1;
10746
10747 if (uShift <= 31)
10748 {
10749 puDst->au32[0] = uSrc1.au32[0] << uShift;
10750 puDst->au32[1] = uSrc1.au32[1] << uShift;
10751 puDst->au32[2] = uSrc1.au32[2] << uShift;
10752 puDst->au32[3] = uSrc1.au32[3] << uShift;
10753 puDst->au32[4] = uSrc1.au32[4] << uShift;
10754 puDst->au32[5] = uSrc1.au32[5] << uShift;
10755 puDst->au32[6] = uSrc1.au32[6] << uShift;
10756 puDst->au32[7] = uSrc1.au32[7] << uShift;
10757 }
10758 else
10759 {
10760 puDst->au64[0] = 0;
10761 puDst->au64[1] = 0;
10762 puDst->au64[2] = 0;
10763 puDst->au64[3] = 0;
10764 }
10765}
10766
10767IEM_DECL_IMPL_DEF(void, iemAImpl_vpslld_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10768{
10769 iemAImpl_vpslld_imm_u256_fallback(puDst, puSrc1, RT_MIN(32, puSrc2->au64[0]));
10770}
10771
10772IEM_DECL_IMPL_DEF(void, iemAImpl_vpslld_imm_u256,(PRTUINT256U puDst, PCRTUINT256U puSrc1, uint8_t uShift))
10773{
10774 iemAImpl_vpslld_imm_u256_fallback(puDst, puSrc1, uShift);
10775}
10776
10777
10778/*
10779 * PSRLQ / VPSRLQ
10780 */
10781#ifdef IEM_WITHOUT_ASSEMBLY
10782
10783IEM_DECL_IMPL_DEF(void, iemAImpl_psrlq_u64,(uint64_t *puDst, uint64_t const *puSrc))
10784{
10785 RTUINT64U uSrc1 = { *puDst };
10786 RTUINT64U uSrc2 = { *puSrc };
10787 RTUINT64U uDst;
10788
10789 if (uSrc2.au64[0] <= 63)
10790 {
10791 uDst.au64[0] = uSrc1.au64[0] >> uSrc2.au8[0];
10792 }
10793 else
10794 {
10795 uDst.au64[0] = 0;
10796 }
10797 *puDst = uDst.u;
10798}
10799
10800
10801IEM_DECL_IMPL_DEF(void, iemAImpl_psrlq_imm_u64,(uint64_t *puDst, uint8_t uShift))
10802{
10803 RTUINT64U uSrc1 = { *puDst };
10804 RTUINT64U uDst;
10805
10806 if (uShift <= 63)
10807 {
10808 uDst.au64[0] = uSrc1.au64[0] >> uShift;
10809 }
10810 else
10811 {
10812 uDst.au64[0] = 0;
10813 }
10814 *puDst = uDst.u;
10815}
10816
10817
10818IEM_DECL_IMPL_DEF(void, iemAImpl_psrlq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
10819{
10820 RTUINT128U uSrc1 = *puDst;
10821
10822 if (puSrc->au64[0] <= 63)
10823 {
10824 puDst->au64[0] = uSrc1.au64[0] >> puSrc->au8[0];
10825 puDst->au64[1] = uSrc1.au64[1] >> puSrc->au8[0];
10826 }
10827 else
10828 {
10829 puDst->au64[0] = 0;
10830 puDst->au64[1] = 0;
10831 }
10832}
10833
10834IEM_DECL_IMPL_DEF(void, iemAImpl_psrlq_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
10835{
10836 RTUINT128U uSrc1 = *puDst;
10837
10838 if (uShift <= 63)
10839 {
10840 puDst->au64[0] = uSrc1.au64[0] >> uShift;
10841 puDst->au64[1] = uSrc1.au64[1] >> uShift;
10842 }
10843 else
10844 {
10845 puDst->au64[0] = 0;
10846 puDst->au64[1] = 0;
10847 }
10848}
10849
10850#endif
10851
10852IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrlq_imm_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, uint8_t uShift))
10853{
10854 RTUINT128U uSrc1 = *puSrc1;
10855
10856 if (uShift <= 63)
10857 {
10858 puDst->au64[0] = uSrc1.au64[0] >> uShift;
10859 puDst->au64[1] = uSrc1.au64[1] >> uShift;
10860 }
10861 else
10862 {
10863 puDst->au64[0] = 0;
10864 puDst->au64[1] = 0;
10865 }
10866}
10867
10868IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrlq_imm_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc1, uint8_t uShift))
10869{
10870 iemAImpl_vpsrlq_imm_u128_fallback(puDst, puSrc1, uShift);
10871}
10872
10873IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrlq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10874{
10875 iemAImpl_vpsrlq_imm_u128_fallback(puDst, puSrc1, RT_MIN(64, puSrc2->au64[0]));
10876}
10877
10878IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrlq_imm_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, uint8_t uShift))
10879{
10880 RTUINT256U uSrc1 = *puSrc1;
10881
10882 if (uShift <= 63)
10883 {
10884 puDst->au64[0] = uSrc1.au64[0] >> uShift;
10885 puDst->au64[1] = uSrc1.au64[1] >> uShift;
10886 puDst->au64[2] = uSrc1.au64[2] >> uShift;
10887 puDst->au64[3] = uSrc1.au64[3] >> uShift;
10888 }
10889 else
10890 {
10891 puDst->au64[0] = 0;
10892 puDst->au64[1] = 0;
10893 puDst->au64[2] = 0;
10894 puDst->au64[3] = 0;
10895 }
10896}
10897
10898IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrlq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10899{
10900 iemAImpl_vpsrlq_imm_u256_fallback(puDst, puSrc1, RT_MIN(64, puSrc2->au64[0]));
10901}
10902
10903IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrlq_imm_u256,(PRTUINT256U puDst, PCRTUINT256U puSrc1, uint8_t uShift))
10904{
10905 iemAImpl_vpsrlq_imm_u256_fallback(puDst, puSrc1, uShift);
10906}
10907
10908
10909/*
10910 * PSLLQ / VPSLLQ
10911 */
10912#ifdef IEM_WITHOUT_ASSEMBLY
10913
10914IEM_DECL_IMPL_DEF(void, iemAImpl_psllq_u64,(uint64_t *puDst, uint64_t const *puSrc))
10915{
10916 RTUINT64U uSrc1 = { *puDst };
10917 RTUINT64U uSrc2 = { *puSrc };
10918 RTUINT64U uDst;
10919
10920 if (uSrc2.au64[0] <= 63)
10921 {
10922 uDst.au64[0] = uSrc1.au64[0] << uSrc2.au8[0];
10923 }
10924 else
10925 {
10926 uDst.au64[0] = 0;
10927 }
10928 *puDst = uDst.u;
10929}
10930
10931
10932IEM_DECL_IMPL_DEF(void, iemAImpl_psllq_imm_u64,(uint64_t *puDst, uint8_t uShift))
10933{
10934 RTUINT64U uSrc1 = { *puDst };
10935 RTUINT64U uDst;
10936
10937 if (uShift <= 63)
10938 {
10939 uDst.au64[0] = uSrc1.au64[0] << uShift;
10940 }
10941 else
10942 {
10943 uDst.au64[0] = 0;
10944 }
10945 *puDst = uDst.u;
10946}
10947
10948
10949IEM_DECL_IMPL_DEF(void, iemAImpl_psllq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
10950{
10951 RTUINT128U uSrc1 = *puDst;
10952
10953 if (puSrc->au64[0] <= 63)
10954 {
10955 puDst->au64[0] = uSrc1.au64[0] << puSrc->au8[0];
10956 puDst->au64[1] = uSrc1.au64[1] << puSrc->au8[0];
10957 }
10958 else
10959 {
10960 puDst->au64[0] = 0;
10961 puDst->au64[1] = 0;
10962 }
10963}
10964
10965IEM_DECL_IMPL_DEF(void, iemAImpl_psllq_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
10966{
10967 RTUINT128U uSrc1 = *puDst;
10968
10969 if (uShift <= 63)
10970 {
10971 puDst->au64[0] = uSrc1.au64[0] << uShift;
10972 puDst->au64[1] = uSrc1.au64[1] << uShift;
10973 }
10974 else
10975 {
10976 puDst->au64[0] = 0;
10977 puDst->au64[1] = 0;
10978 }
10979}
10980
10981#endif
10982
10983IEM_DECL_IMPL_DEF(void, iemAImpl_vpsllq_imm_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, uint8_t uShift))
10984{
10985 RTUINT128U uSrc1 = *puSrc1;
10986
10987 if (uShift <= 63)
10988 {
10989 puDst->au64[0] = uSrc1.au64[0] << uShift;
10990 puDst->au64[1] = uSrc1.au64[1] << uShift;
10991 }
10992 else
10993 {
10994 puDst->au64[0] = 0;
10995 puDst->au64[1] = 0;
10996 }
10997}
10998
10999IEM_DECL_IMPL_DEF(void, iemAImpl_vpsllq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11000{
11001 iemAImpl_vpsllq_imm_u128_fallback(puDst, puSrc1, RT_MIN(64, puSrc2->au64[0]));
11002}
11003
11004IEM_DECL_IMPL_DEF(void, iemAImpl_vpsllq_imm_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc1, uint8_t uShift))
11005{
11006 iemAImpl_vpsllq_imm_u128_fallback(puDst, puSrc1, uShift);
11007}
11008
11009IEM_DECL_IMPL_DEF(void, iemAImpl_vpsllq_imm_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, uint8_t uShift))
11010{
11011 RTUINT256U uSrc1 = *puSrc1;
11012
11013 if (uShift <= 63)
11014 {
11015 puDst->au64[0] = uSrc1.au64[0] << uShift;
11016 puDst->au64[1] = uSrc1.au64[1] << uShift;
11017 puDst->au64[2] = uSrc1.au64[2] << uShift;
11018 puDst->au64[3] = uSrc1.au64[3] << uShift;
11019 }
11020 else
11021 {
11022 puDst->au64[0] = 0;
11023 puDst->au64[1] = 0;
11024 puDst->au64[2] = 0;
11025 puDst->au64[3] = 0;
11026 }
11027}
11028
11029IEM_DECL_IMPL_DEF(void, iemAImpl_vpsllq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11030{
11031 iemAImpl_vpsllq_imm_u256_fallback(puDst, puSrc1, RT_MIN(64, puSrc2->au64[0]));
11032}
11033
11034IEM_DECL_IMPL_DEF(void, iemAImpl_vpsllq_imm_u256,(PRTUINT256U puDst, PCRTUINT256U puSrc1, uint8_t uShift))
11035{
11036 iemAImpl_vpsllq_imm_u256_fallback(puDst, puSrc1, uShift);
11037}
11038
11039
11040/*
11041 * PSRLDQ / VPSRLDQ
11042 */
11043#ifdef IEM_WITHOUT_ASSEMBLY
11044
11045IEM_DECL_IMPL_DEF(void, iemAImpl_psrldq_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
11046{
11047 if (uShift < 16)
11048 {
11049 RTUINT128U uSrc1 = *puDst;
11050 int i;
11051
11052 for (i = 0; i < 16 - uShift; ++i)
11053 puDst->au8[i] = uSrc1.au8[i + uShift];
11054 for (i = 16 - uShift; i < 16; ++i)
11055 puDst->au8[i] = 0;
11056 }
11057 else
11058 {
11059 puDst->au64[0] = 0;
11060 puDst->au64[1] = 0;
11061 }
11062}
11063
11064IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrldq_imm_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t uShift))
11065{
11066 if (uShift < 16)
11067 {
11068 RTUINT128U uSrc1 = *puSrc;
11069 int i;
11070
11071 for (i = 0; i < 16 - uShift; ++i)
11072 puDst->au8[i] = uSrc1.au8[i + uShift];
11073 for (i = 16 - uShift; i < 16; ++i)
11074 puDst->au8[i] = 0;
11075 }
11076 else
11077 {
11078 puDst->au64[0] = 0;
11079 puDst->au64[1] = 0;
11080 }
11081}
11082
11083IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrldq_imm_u256,(PRTUINT256U puDst, PCRTUINT256U puSrc, uint8_t uShift))
11084{
11085 iemAImpl_vpsrldq_imm_u128(&puDst->au128[0], &puSrc->au128[0], uShift);
11086 iemAImpl_vpsrldq_imm_u128(&puDst->au128[1], &puSrc->au128[1], uShift);
11087}
11088#endif
11089
11090IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrldq_imm_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t uShift))
11091{
11092 if (uShift < 16)
11093 {
11094 RTUINT128U uSrc1 = *puSrc;
11095 int i;
11096
11097 for (i = 0; i < 16 - uShift; ++i)
11098 puDst->au8[i] = uSrc1.au8[i + uShift];
11099 for (i = 16 - uShift; i < 16; ++i)
11100 puDst->au8[i] = 0;
11101 }
11102 else
11103 {
11104 puDst->au64[0] = 0;
11105 puDst->au64[1] = 0;
11106 }
11107}
11108
11109IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrldq_imm_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc, uint8_t uShift))
11110{
11111 iemAImpl_vpsrldq_imm_u128_fallback(&puDst->au128[0], &puSrc->au128[0], uShift);
11112 iemAImpl_vpsrldq_imm_u128_fallback(&puDst->au128[1], &puSrc->au128[1], uShift);
11113}
11114
11115
11116/*
11117 * PSLLDQ / VPSLLDQ
11118 */
11119#ifdef IEM_WITHOUT_ASSEMBLY
11120
11121IEM_DECL_IMPL_DEF(void, iemAImpl_pslldq_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
11122{
11123 if (uShift < 16)
11124 {
11125 RTUINT128U uSrc1 = *puDst;
11126 int i;
11127
11128 for (i = 0; i < uShift; ++i)
11129 puDst->au8[i] = 0;
11130 for (i = uShift; i < 16; ++i)
11131 puDst->au8[i] = uSrc1.au8[i - uShift];
11132 }
11133 else
11134 {
11135 puDst->au64[0] = 0;
11136 puDst->au64[1] = 0;
11137 }
11138}
11139
11140IEM_DECL_IMPL_DEF(void, iemAImpl_vpslldq_imm_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t uShift))
11141{
11142 if (uShift < 16)
11143 {
11144 RTUINT128U uSrc1 = *puSrc;
11145 int i;
11146
11147 for (i = 0; i < uShift; ++i)
11148 puDst->au8[i] = 0;
11149 for (i = uShift; i < 16; ++i)
11150 puDst->au8[i] = uSrc1.au8[i - uShift];
11151 }
11152 else
11153 {
11154 puDst->au64[0] = 0;
11155 puDst->au64[1] = 0;
11156 }
11157}
11158
11159IEM_DECL_IMPL_DEF(void, iemAImpl_vpslldq_imm_u256,(PRTUINT256U puDst, PCRTUINT256U puSrc, uint8_t uShift))
11160{
11161 iemAImpl_vpslldq_imm_u128(&puDst->au128[0], &puSrc->au128[0], uShift);
11162 iemAImpl_vpslldq_imm_u128(&puDst->au128[1], &puSrc->au128[1], uShift);
11163}
11164
11165#endif
11166
11167IEM_DECL_IMPL_DEF(void, iemAImpl_vpslldq_imm_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t uShift))
11168{
11169 if (uShift < 16)
11170 {
11171 RTUINT128U uSrc1 = *puSrc;
11172 int i;
11173
11174 for (i = 0; i < uShift; ++i)
11175 puDst->au8[i] = 0;
11176 for (i = uShift; i < 16; ++i)
11177 puDst->au8[i] = uSrc1.au8[i - uShift];
11178 }
11179 else
11180 {
11181 puDst->au64[0] = 0;
11182 puDst->au64[1] = 0;
11183 }
11184}
11185
11186IEM_DECL_IMPL_DEF(void, iemAImpl_vpslldq_imm_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc, uint8_t uShift))
11187{
11188 iemAImpl_vpslldq_imm_u128_fallback(&puDst->au128[0], &puSrc->au128[0], uShift);
11189 iemAImpl_vpslldq_imm_u128_fallback(&puDst->au128[1], &puSrc->au128[1], uShift);
11190}
11191
11192
11193/*
11194 * VPSRLVD
11195 */
11196IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrlvd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11197{
11198 for (uint8_t uU32 = 0; uU32 < RT_ELEMENTS(puDst->au32); ++uU32)
11199 {
11200 puDst->au32[uU32] = (puSrc2->au32[uU32] > 31) ? 0 : puSrc1->au32[uU32] >> puSrc2->au8[uU32 << 2];
11201 }
11202}
11203
11204IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrlvd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11205{
11206 for (uint8_t uU32 = 0; uU32 < RT_ELEMENTS(puDst->au32); ++uU32)
11207 {
11208 puDst->au32[uU32] = (puSrc2->au32[uU32] > 31) ? 0 : puSrc1->au32[uU32] >> puSrc2->au8[uU32 << 2];
11209 }
11210}
11211
11212
11213/*
11214 * VPSRAVD
11215 */
11216IEM_DECL_IMPL_DEF(void, iemAImpl_vpsravd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11217{
11218 for (uint8_t uI32 = 0; uI32 < RT_ELEMENTS(puDst->ai32); ++uI32)
11219 {
11220 puDst->ai32[uI32] = (puSrc2->au32[uI32] > 31) ? 0 : puSrc1->ai32[uI32] >> puSrc2->au8[uI32 << 2];
11221 }
11222}
11223
11224IEM_DECL_IMPL_DEF(void, iemAImpl_vpsravd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11225{
11226 for (uint8_t uI32 = 0; uI32 < RT_ELEMENTS(puDst->ai32); ++uI32)
11227 {
11228 puDst->ai32[uI32] = (puSrc2->au32[uI32] > 31) ? 0 : puSrc1->ai32[uI32] >> puSrc2->au8[uI32 << 2];
11229 }
11230}
11231
11232
11233/*
11234 * VPSLLVD
11235 */
11236IEM_DECL_IMPL_DEF(void, iemAImpl_vpsllvd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11237{
11238 for (uint8_t uU32 = 0; uU32 < RT_ELEMENTS(puDst->au32); ++uU32)
11239 {
11240 puDst->au32[uU32] = (puSrc2->au32[uU32] > 31) ? 0 : puSrc1->au32[uU32] << puSrc2->au8[uU32 << 2];
11241 }
11242}
11243
11244IEM_DECL_IMPL_DEF(void, iemAImpl_vpsllvd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11245{
11246 for (uint8_t uU32 = 0; uU32 < RT_ELEMENTS(puDst->au32); ++uU32)
11247 {
11248 puDst->au32[uU32] = (puSrc2->au32[uU32] > 31) ? 0 : puSrc1->au32[uU32] << puSrc2->au8[uU32 << 2];
11249 }
11250}
11251
11252
11253/*
11254 * VPSRLVQ
11255 */
11256IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrlvq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11257{
11258 for (uint8_t uU64 = 0; uU64 < RT_ELEMENTS(puDst->au64); ++uU64)
11259 {
11260 puDst->au64[uU64] = (puSrc2->au64[uU64] > 63) ? 0 : puSrc1->au64[uU64] >> puSrc2->au8[uU64 << 3];
11261 }
11262}
11263
11264IEM_DECL_IMPL_DEF(void, iemAImpl_vpsrlvq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11265{
11266 for (uint8_t uU64 = 0; uU64 < RT_ELEMENTS(puDst->au64); ++uU64)
11267 {
11268 puDst->au64[uU64] = (puSrc2->au64[uU64] > 63) ? 0 : puSrc1->au64[uU64] >> puSrc2->au8[uU64 << 3];
11269 }
11270}
11271
11272
11273/*
11274 * VPSLLVQ
11275 */
11276IEM_DECL_IMPL_DEF(void, iemAImpl_vpsllvq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11277{
11278 for (uint8_t uU64 = 0; uU64 < RT_ELEMENTS(puDst->au64); ++uU64)
11279 {
11280 puDst->au64[uU64] = (puSrc2->au64[uU64] > 63) ? 0 : puSrc1->au64[uU64] << puSrc2->au8[uU64 << 3];
11281 }
11282}
11283
11284IEM_DECL_IMPL_DEF(void, iemAImpl_vpsllvq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11285{
11286 for (uint8_t uU64 = 0; uU64 < RT_ELEMENTS(puDst->au64); ++uU64)
11287 {
11288 puDst->au64[uU64] = (puSrc2->au64[uU64] > 63) ? 0 : puSrc1->au64[uU64] << puSrc2->au8[uU64 << 3];
11289 }
11290}
11291
11292
11293/*
11294 * PMADDWD / VPMADDWD
11295 */
11296#ifdef IEM_WITHOUT_ASSEMBLY
11297
11298IEM_DECL_IMPL_DEF(void, iemAImpl_pmaddwd_u64,(uint64_t *puDst, uint64_t const *puSrc))
11299{
11300 RTUINT64U uSrc1 = { *puDst };
11301 RTUINT64U uSrc2 = { *puSrc };
11302 RTUINT64U uDst;
11303
11304 uDst.ai32[0] = (int32_t)uSrc1.ai16[0] * uSrc2.ai16[0] + (int32_t)uSrc1.ai16[1] * uSrc2.ai16[1];
11305 uDst.ai32[1] = (int32_t)uSrc1.ai16[2] * uSrc2.ai16[2] + (int32_t)uSrc1.ai16[3] * uSrc2.ai16[3];
11306 *puDst = uDst.u;
11307}
11308
11309
11310IEM_DECL_IMPL_DEF(void, iemAImpl_pmaddwd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11311{
11312 RTUINT128U uSrc1 = *puDst;
11313
11314 puDst->ai32[0] = (int32_t)uSrc1.ai16[0] * puSrc->ai16[0] + (int32_t)uSrc1.ai16[1] * puSrc->ai16[1];
11315 puDst->ai32[1] = (int32_t)uSrc1.ai16[2] * puSrc->ai16[2] + (int32_t)uSrc1.ai16[3] * puSrc->ai16[3];
11316 puDst->ai32[2] = (int32_t)uSrc1.ai16[4] * puSrc->ai16[4] + (int32_t)uSrc1.ai16[5] * puSrc->ai16[5];
11317 puDst->ai32[3] = (int32_t)uSrc1.ai16[6] * puSrc->ai16[6] + (int32_t)uSrc1.ai16[7] * puSrc->ai16[7];
11318}
11319
11320#endif
11321
11322
11323IEM_DECL_IMPL_DEF(void, iemAImpl_pmaddwd_u64_fallback,(uint64_t *puDst, uint64_t const *puSrc))
11324{
11325 RTUINT64U uSrc1 = { *puDst };
11326 RTUINT64U uSrc2 = { *puSrc };
11327 RTUINT64U uDst;
11328
11329 uDst.ai32[0] = (int32_t)uSrc1.ai16[0] * uSrc2.ai16[0] + (int32_t)uSrc1.ai16[1] * uSrc2.ai16[1];
11330 uDst.ai32[1] = (int32_t)uSrc1.ai16[2] * uSrc2.ai16[2] + (int32_t)uSrc1.ai16[3] * uSrc2.ai16[3];
11331 *puDst = uDst.u;
11332}
11333
11334
11335IEM_DECL_IMPL_DEF(void, iemAImpl_pmaddwd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11336{
11337 RTUINT128U uSrc1 = *puDst;
11338
11339 puDst->ai32[0] = (int32_t)uSrc1.ai16[0] * puSrc->ai16[0] + (int32_t)uSrc1.ai16[1] * puSrc->ai16[1];
11340 puDst->ai32[1] = (int32_t)uSrc1.ai16[2] * puSrc->ai16[2] + (int32_t)uSrc1.ai16[3] * puSrc->ai16[3];
11341 puDst->ai32[2] = (int32_t)uSrc1.ai16[4] * puSrc->ai16[4] + (int32_t)uSrc1.ai16[5] * puSrc->ai16[5];
11342 puDst->ai32[3] = (int32_t)uSrc1.ai16[6] * puSrc->ai16[6] + (int32_t)uSrc1.ai16[7] * puSrc->ai16[7];
11343}
11344
11345
11346IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaddwd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11347{
11348 puDst->ai32[0] = (int32_t)puSrc1->ai16[0] * puSrc2->ai16[0] + (int32_t)puSrc1->ai16[1] * puSrc2->ai16[1];
11349 puDst->ai32[1] = (int32_t)puSrc1->ai16[2] * puSrc2->ai16[2] + (int32_t)puSrc1->ai16[3] * puSrc2->ai16[3];
11350 puDst->ai32[2] = (int32_t)puSrc1->ai16[4] * puSrc2->ai16[4] + (int32_t)puSrc1->ai16[5] * puSrc2->ai16[5];
11351 puDst->ai32[3] = (int32_t)puSrc1->ai16[6] * puSrc2->ai16[6] + (int32_t)puSrc1->ai16[7] * puSrc2->ai16[7];
11352}
11353
11354
11355IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaddwd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11356{
11357 puDst->ai32[0] = (int32_t)puSrc1->ai16[0] * puSrc2->ai16[0] + (int32_t)puSrc1->ai16[1] * puSrc2->ai16[1];
11358 puDst->ai32[1] = (int32_t)puSrc1->ai16[2] * puSrc2->ai16[2] + (int32_t)puSrc1->ai16[3] * puSrc2->ai16[3];
11359 puDst->ai32[2] = (int32_t)puSrc1->ai16[4] * puSrc2->ai16[4] + (int32_t)puSrc1->ai16[5] * puSrc2->ai16[5];
11360 puDst->ai32[3] = (int32_t)puSrc1->ai16[6] * puSrc2->ai16[6] + (int32_t)puSrc1->ai16[7] * puSrc2->ai16[7];
11361 puDst->ai32[4] = (int32_t)puSrc1->ai16[8] * puSrc2->ai16[8] + (int32_t)puSrc1->ai16[9] * puSrc2->ai16[9];
11362 puDst->ai32[5] = (int32_t)puSrc1->ai16[10] * puSrc2->ai16[10] + (int32_t)puSrc1->ai16[11] * puSrc2->ai16[11];
11363 puDst->ai32[6] = (int32_t)puSrc1->ai16[12] * puSrc2->ai16[12] + (int32_t)puSrc1->ai16[13] * puSrc2->ai16[13];
11364 puDst->ai32[7] = (int32_t)puSrc1->ai16[14] * puSrc2->ai16[14] + (int32_t)puSrc1->ai16[15] * puSrc2->ai16[15];
11365}
11366
11367
11368/*
11369 * PMAXUB / VPMAXUB / PMAXUW / VPMAXUW / PMAXUD / VPMAXUD
11370 */
11371#ifdef IEM_WITHOUT_ASSEMBLY
11372
11373IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxub_u64,(uint64_t *puDst, uint64_t const *puSrc))
11374{
11375 RTUINT64U uSrc1 = { *puDst };
11376 RTUINT64U uSrc2 = { *puSrc };
11377 RTUINT64U uDst;
11378
11379 uDst.au8[0] = RT_MAX(uSrc1.au8[0], uSrc2.au8[0]);
11380 uDst.au8[1] = RT_MAX(uSrc1.au8[1], uSrc2.au8[1]);
11381 uDst.au8[2] = RT_MAX(uSrc1.au8[2], uSrc2.au8[2]);
11382 uDst.au8[3] = RT_MAX(uSrc1.au8[3], uSrc2.au8[3]);
11383 uDst.au8[4] = RT_MAX(uSrc1.au8[4], uSrc2.au8[4]);
11384 uDst.au8[5] = RT_MAX(uSrc1.au8[5], uSrc2.au8[5]);
11385 uDst.au8[6] = RT_MAX(uSrc1.au8[6], uSrc2.au8[6]);
11386 uDst.au8[7] = RT_MAX(uSrc1.au8[7], uSrc2.au8[7]);
11387 *puDst = uDst.u;
11388}
11389
11390
11391IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxub_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11392{
11393 RTUINT128U uSrc1 = *puDst;
11394
11395 puDst->au8[ 0] = RT_MAX(uSrc1.au8[ 0], puSrc->au8[ 0]);
11396 puDst->au8[ 1] = RT_MAX(uSrc1.au8[ 1], puSrc->au8[ 1]);
11397 puDst->au8[ 2] = RT_MAX(uSrc1.au8[ 2], puSrc->au8[ 2]);
11398 puDst->au8[ 3] = RT_MAX(uSrc1.au8[ 3], puSrc->au8[ 3]);
11399 puDst->au8[ 4] = RT_MAX(uSrc1.au8[ 4], puSrc->au8[ 4]);
11400 puDst->au8[ 5] = RT_MAX(uSrc1.au8[ 5], puSrc->au8[ 5]);
11401 puDst->au8[ 6] = RT_MAX(uSrc1.au8[ 6], puSrc->au8[ 6]);
11402 puDst->au8[ 7] = RT_MAX(uSrc1.au8[ 7], puSrc->au8[ 7]);
11403 puDst->au8[ 8] = RT_MAX(uSrc1.au8[ 8], puSrc->au8[ 8]);
11404 puDst->au8[ 9] = RT_MAX(uSrc1.au8[ 9], puSrc->au8[ 9]);
11405 puDst->au8[10] = RT_MAX(uSrc1.au8[10], puSrc->au8[10]);
11406 puDst->au8[11] = RT_MAX(uSrc1.au8[11], puSrc->au8[11]);
11407 puDst->au8[12] = RT_MAX(uSrc1.au8[12], puSrc->au8[12]);
11408 puDst->au8[13] = RT_MAX(uSrc1.au8[13], puSrc->au8[13]);
11409 puDst->au8[14] = RT_MAX(uSrc1.au8[14], puSrc->au8[14]);
11410 puDst->au8[15] = RT_MAX(uSrc1.au8[15], puSrc->au8[15]);
11411}
11412
11413#endif
11414
11415
11416IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxuw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11417{
11418 RTUINT128U uSrc1 = *puDst;
11419
11420 puDst->au16[ 0] = RT_MAX(uSrc1.au16[ 0], puSrc->au16[ 0]);
11421 puDst->au16[ 1] = RT_MAX(uSrc1.au16[ 1], puSrc->au16[ 1]);
11422 puDst->au16[ 2] = RT_MAX(uSrc1.au16[ 2], puSrc->au16[ 2]);
11423 puDst->au16[ 3] = RT_MAX(uSrc1.au16[ 3], puSrc->au16[ 3]);
11424 puDst->au16[ 4] = RT_MAX(uSrc1.au16[ 4], puSrc->au16[ 4]);
11425 puDst->au16[ 5] = RT_MAX(uSrc1.au16[ 5], puSrc->au16[ 5]);
11426 puDst->au16[ 6] = RT_MAX(uSrc1.au16[ 6], puSrc->au16[ 6]);
11427 puDst->au16[ 7] = RT_MAX(uSrc1.au16[ 7], puSrc->au16[ 7]);
11428}
11429
11430
11431IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxud_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11432{
11433 RTUINT128U uSrc1 = *puDst;
11434
11435 puDst->au32[ 0] = RT_MAX(uSrc1.au32[ 0], puSrc->au32[ 0]);
11436 puDst->au32[ 1] = RT_MAX(uSrc1.au32[ 1], puSrc->au32[ 1]);
11437 puDst->au32[ 2] = RT_MAX(uSrc1.au32[ 2], puSrc->au32[ 2]);
11438 puDst->au32[ 3] = RT_MAX(uSrc1.au32[ 3], puSrc->au32[ 3]);
11439}
11440
11441
11442IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxub_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11443{
11444 puDst->au8[ 0] = RT_MAX(puSrc1->au8[ 0], puSrc2->au8[ 0]);
11445 puDst->au8[ 1] = RT_MAX(puSrc1->au8[ 1], puSrc2->au8[ 1]);
11446 puDst->au8[ 2] = RT_MAX(puSrc1->au8[ 2], puSrc2->au8[ 2]);
11447 puDst->au8[ 3] = RT_MAX(puSrc1->au8[ 3], puSrc2->au8[ 3]);
11448 puDst->au8[ 4] = RT_MAX(puSrc1->au8[ 4], puSrc2->au8[ 4]);
11449 puDst->au8[ 5] = RT_MAX(puSrc1->au8[ 5], puSrc2->au8[ 5]);
11450 puDst->au8[ 6] = RT_MAX(puSrc1->au8[ 6], puSrc2->au8[ 6]);
11451 puDst->au8[ 7] = RT_MAX(puSrc1->au8[ 7], puSrc2->au8[ 7]);
11452 puDst->au8[ 8] = RT_MAX(puSrc1->au8[ 8], puSrc2->au8[ 8]);
11453 puDst->au8[ 9] = RT_MAX(puSrc1->au8[ 9], puSrc2->au8[ 9]);
11454 puDst->au8[10] = RT_MAX(puSrc1->au8[10], puSrc2->au8[10]);
11455 puDst->au8[11] = RT_MAX(puSrc1->au8[11], puSrc2->au8[11]);
11456 puDst->au8[12] = RT_MAX(puSrc1->au8[12], puSrc2->au8[12]);
11457 puDst->au8[13] = RT_MAX(puSrc1->au8[13], puSrc2->au8[13]);
11458 puDst->au8[14] = RT_MAX(puSrc1->au8[14], puSrc2->au8[14]);
11459 puDst->au8[15] = RT_MAX(puSrc1->au8[15], puSrc2->au8[15]);
11460}
11461
11462
11463IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxub_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11464{
11465 puDst->au8[ 0] = RT_MAX(puSrc1->au8[ 0], puSrc2->au8[ 0]);
11466 puDst->au8[ 1] = RT_MAX(puSrc1->au8[ 1], puSrc2->au8[ 1]);
11467 puDst->au8[ 2] = RT_MAX(puSrc1->au8[ 2], puSrc2->au8[ 2]);
11468 puDst->au8[ 3] = RT_MAX(puSrc1->au8[ 3], puSrc2->au8[ 3]);
11469 puDst->au8[ 4] = RT_MAX(puSrc1->au8[ 4], puSrc2->au8[ 4]);
11470 puDst->au8[ 5] = RT_MAX(puSrc1->au8[ 5], puSrc2->au8[ 5]);
11471 puDst->au8[ 6] = RT_MAX(puSrc1->au8[ 6], puSrc2->au8[ 6]);
11472 puDst->au8[ 7] = RT_MAX(puSrc1->au8[ 7], puSrc2->au8[ 7]);
11473 puDst->au8[ 8] = RT_MAX(puSrc1->au8[ 8], puSrc2->au8[ 8]);
11474 puDst->au8[ 9] = RT_MAX(puSrc1->au8[ 9], puSrc2->au8[ 9]);
11475 puDst->au8[10] = RT_MAX(puSrc1->au8[10], puSrc2->au8[10]);
11476 puDst->au8[11] = RT_MAX(puSrc1->au8[11], puSrc2->au8[11]);
11477 puDst->au8[12] = RT_MAX(puSrc1->au8[12], puSrc2->au8[12]);
11478 puDst->au8[13] = RT_MAX(puSrc1->au8[13], puSrc2->au8[13]);
11479 puDst->au8[14] = RT_MAX(puSrc1->au8[14], puSrc2->au8[14]);
11480 puDst->au8[15] = RT_MAX(puSrc1->au8[15], puSrc2->au8[15]);
11481 puDst->au8[16] = RT_MAX(puSrc1->au8[16], puSrc2->au8[16]);
11482 puDst->au8[17] = RT_MAX(puSrc1->au8[17], puSrc2->au8[17]);
11483 puDst->au8[18] = RT_MAX(puSrc1->au8[18], puSrc2->au8[18]);
11484 puDst->au8[19] = RT_MAX(puSrc1->au8[19], puSrc2->au8[19]);
11485 puDst->au8[20] = RT_MAX(puSrc1->au8[20], puSrc2->au8[20]);
11486 puDst->au8[21] = RT_MAX(puSrc1->au8[21], puSrc2->au8[21]);
11487 puDst->au8[22] = RT_MAX(puSrc1->au8[22], puSrc2->au8[22]);
11488 puDst->au8[23] = RT_MAX(puSrc1->au8[23], puSrc2->au8[23]);
11489 puDst->au8[24] = RT_MAX(puSrc1->au8[24], puSrc2->au8[24]);
11490 puDst->au8[25] = RT_MAX(puSrc1->au8[25], puSrc2->au8[25]);
11491 puDst->au8[26] = RT_MAX(puSrc1->au8[26], puSrc2->au8[26]);
11492 puDst->au8[27] = RT_MAX(puSrc1->au8[27], puSrc2->au8[27]);
11493 puDst->au8[28] = RT_MAX(puSrc1->au8[28], puSrc2->au8[28]);
11494 puDst->au8[29] = RT_MAX(puSrc1->au8[29], puSrc2->au8[29]);
11495 puDst->au8[30] = RT_MAX(puSrc1->au8[30], puSrc2->au8[30]);
11496 puDst->au8[31] = RT_MAX(puSrc1->au8[31], puSrc2->au8[31]);
11497}
11498
11499
11500IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxuw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11501{
11502 puDst->au16[ 0] = RT_MAX(puSrc1->au16[ 0], puSrc2->au16[ 0]);
11503 puDst->au16[ 1] = RT_MAX(puSrc1->au16[ 1], puSrc2->au16[ 1]);
11504 puDst->au16[ 2] = RT_MAX(puSrc1->au16[ 2], puSrc2->au16[ 2]);
11505 puDst->au16[ 3] = RT_MAX(puSrc1->au16[ 3], puSrc2->au16[ 3]);
11506 puDst->au16[ 4] = RT_MAX(puSrc1->au16[ 4], puSrc2->au16[ 4]);
11507 puDst->au16[ 5] = RT_MAX(puSrc1->au16[ 5], puSrc2->au16[ 5]);
11508 puDst->au16[ 6] = RT_MAX(puSrc1->au16[ 6], puSrc2->au16[ 6]);
11509 puDst->au16[ 7] = RT_MAX(puSrc1->au16[ 7], puSrc2->au16[ 7]);
11510}
11511
11512
11513IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxuw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11514{
11515 puDst->au16[ 0] = RT_MAX(puSrc1->au16[ 0], puSrc2->au16[ 0]);
11516 puDst->au16[ 1] = RT_MAX(puSrc1->au16[ 1], puSrc2->au16[ 1]);
11517 puDst->au16[ 2] = RT_MAX(puSrc1->au16[ 2], puSrc2->au16[ 2]);
11518 puDst->au16[ 3] = RT_MAX(puSrc1->au16[ 3], puSrc2->au16[ 3]);
11519 puDst->au16[ 4] = RT_MAX(puSrc1->au16[ 4], puSrc2->au16[ 4]);
11520 puDst->au16[ 5] = RT_MAX(puSrc1->au16[ 5], puSrc2->au16[ 5]);
11521 puDst->au16[ 6] = RT_MAX(puSrc1->au16[ 6], puSrc2->au16[ 6]);
11522 puDst->au16[ 7] = RT_MAX(puSrc1->au16[ 7], puSrc2->au16[ 7]);
11523 puDst->au16[ 8] = RT_MAX(puSrc1->au16[ 8], puSrc2->au16[ 8]);
11524 puDst->au16[ 9] = RT_MAX(puSrc1->au16[ 9], puSrc2->au16[ 9]);
11525 puDst->au16[10] = RT_MAX(puSrc1->au16[10], puSrc2->au16[10]);
11526 puDst->au16[11] = RT_MAX(puSrc1->au16[11], puSrc2->au16[11]);
11527 puDst->au16[12] = RT_MAX(puSrc1->au16[12], puSrc2->au16[12]);
11528 puDst->au16[13] = RT_MAX(puSrc1->au16[13], puSrc2->au16[13]);
11529 puDst->au16[14] = RT_MAX(puSrc1->au16[14], puSrc2->au16[14]);
11530 puDst->au16[15] = RT_MAX(puSrc1->au16[15], puSrc2->au16[15]);
11531}
11532
11533
11534IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxud_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11535{
11536 puDst->au32[ 0] = RT_MAX(puSrc1->au32[ 0], puSrc2->au32[ 0]);
11537 puDst->au32[ 1] = RT_MAX(puSrc1->au32[ 1], puSrc2->au32[ 1]);
11538 puDst->au32[ 2] = RT_MAX(puSrc1->au32[ 2], puSrc2->au32[ 2]);
11539 puDst->au32[ 3] = RT_MAX(puSrc1->au32[ 3], puSrc2->au32[ 3]);
11540}
11541
11542
11543IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxud_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11544{
11545 puDst->au32[ 0] = RT_MAX(puSrc1->au32[ 0], puSrc2->au32[ 0]);
11546 puDst->au32[ 1] = RT_MAX(puSrc1->au32[ 1], puSrc2->au32[ 1]);
11547 puDst->au32[ 2] = RT_MAX(puSrc1->au32[ 2], puSrc2->au32[ 2]);
11548 puDst->au32[ 3] = RT_MAX(puSrc1->au32[ 3], puSrc2->au32[ 3]);
11549 puDst->au32[ 4] = RT_MAX(puSrc1->au32[ 4], puSrc2->au32[ 4]);
11550 puDst->au32[ 5] = RT_MAX(puSrc1->au32[ 5], puSrc2->au32[ 5]);
11551 puDst->au32[ 6] = RT_MAX(puSrc1->au32[ 6], puSrc2->au32[ 6]);
11552 puDst->au32[ 7] = RT_MAX(puSrc1->au32[ 7], puSrc2->au32[ 7]);
11553}
11554
11555
11556/*
11557 * PMAXSB / VPMAXSB / PMAXSW / VPMAXSW / PMAXSD / VPMAXSD
11558 */
11559#ifdef IEM_WITHOUT_ASSEMBLY
11560
11561IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxsw_u64,(uint64_t *puDst, uint64_t const *puSrc))
11562{
11563 RTUINT64U uSrc1 = { *puDst };
11564 RTUINT64U uSrc2 = { *puSrc };
11565 RTUINT64U uDst;
11566
11567 uDst.ai16[0] = RT_MAX(uSrc1.ai16[0], uSrc2.ai16[0]);
11568 uDst.ai16[1] = RT_MAX(uSrc1.ai16[1], uSrc2.ai16[1]);
11569 uDst.ai16[2] = RT_MAX(uSrc1.ai16[2], uSrc2.ai16[2]);
11570 uDst.ai16[3] = RT_MAX(uSrc1.ai16[3], uSrc2.ai16[3]);
11571 *puDst = uDst.u;
11572}
11573
11574
11575IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxsw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11576{
11577 RTUINT128U uSrc1 = *puDst;
11578
11579 puDst->ai16[ 0] = RT_MAX(uSrc1.ai16[ 0], puSrc->ai16[ 0]);
11580 puDst->ai16[ 1] = RT_MAX(uSrc1.ai16[ 1], puSrc->ai16[ 1]);
11581 puDst->ai16[ 2] = RT_MAX(uSrc1.ai16[ 2], puSrc->ai16[ 2]);
11582 puDst->ai16[ 3] = RT_MAX(uSrc1.ai16[ 3], puSrc->ai16[ 3]);
11583 puDst->ai16[ 4] = RT_MAX(uSrc1.ai16[ 4], puSrc->ai16[ 4]);
11584 puDst->ai16[ 5] = RT_MAX(uSrc1.ai16[ 5], puSrc->ai16[ 5]);
11585 puDst->ai16[ 6] = RT_MAX(uSrc1.ai16[ 6], puSrc->ai16[ 6]);
11586 puDst->ai16[ 7] = RT_MAX(uSrc1.ai16[ 7], puSrc->ai16[ 7]);
11587}
11588
11589#endif
11590
11591IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxsb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11592{
11593 RTUINT128U uSrc1 = *puDst;
11594
11595 puDst->ai8[ 0] = RT_MAX(uSrc1.ai8[ 0], puSrc->ai8[ 0]);
11596 puDst->ai8[ 1] = RT_MAX(uSrc1.ai8[ 1], puSrc->ai8[ 1]);
11597 puDst->ai8[ 2] = RT_MAX(uSrc1.ai8[ 2], puSrc->ai8[ 2]);
11598 puDst->ai8[ 3] = RT_MAX(uSrc1.ai8[ 3], puSrc->ai8[ 3]);
11599 puDst->ai8[ 4] = RT_MAX(uSrc1.ai8[ 4], puSrc->ai8[ 4]);
11600 puDst->ai8[ 5] = RT_MAX(uSrc1.ai8[ 5], puSrc->ai8[ 5]);
11601 puDst->ai8[ 6] = RT_MAX(uSrc1.ai8[ 6], puSrc->ai8[ 6]);
11602 puDst->ai8[ 7] = RT_MAX(uSrc1.ai8[ 7], puSrc->ai8[ 7]);
11603 puDst->ai8[ 8] = RT_MAX(uSrc1.ai8[ 8], puSrc->ai8[ 8]);
11604 puDst->ai8[ 9] = RT_MAX(uSrc1.ai8[ 9], puSrc->ai8[ 9]);
11605 puDst->ai8[10] = RT_MAX(uSrc1.ai8[10], puSrc->ai8[10]);
11606 puDst->ai8[11] = RT_MAX(uSrc1.ai8[11], puSrc->ai8[11]);
11607 puDst->ai8[12] = RT_MAX(uSrc1.ai8[12], puSrc->ai8[12]);
11608 puDst->ai8[13] = RT_MAX(uSrc1.ai8[13], puSrc->ai8[13]);
11609 puDst->ai8[14] = RT_MAX(uSrc1.ai8[14], puSrc->ai8[14]);
11610 puDst->ai8[15] = RT_MAX(uSrc1.ai8[15], puSrc->ai8[15]);
11611}
11612
11613
11614IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxsd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11615{
11616 RTUINT128U uSrc1 = *puDst;
11617
11618 puDst->ai32[ 0] = RT_MAX(uSrc1.ai32[ 0], puSrc->ai32[ 0]);
11619 puDst->ai32[ 1] = RT_MAX(uSrc1.ai32[ 1], puSrc->ai32[ 1]);
11620 puDst->ai32[ 2] = RT_MAX(uSrc1.ai32[ 2], puSrc->ai32[ 2]);
11621 puDst->ai32[ 3] = RT_MAX(uSrc1.ai32[ 3], puSrc->ai32[ 3]);
11622}
11623
11624
11625IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxsb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11626{
11627 puDst->ai8[ 0] = RT_MAX(puSrc1->ai8[ 0], puSrc2->ai8[ 0]);
11628 puDst->ai8[ 1] = RT_MAX(puSrc1->ai8[ 1], puSrc2->ai8[ 1]);
11629 puDst->ai8[ 2] = RT_MAX(puSrc1->ai8[ 2], puSrc2->ai8[ 2]);
11630 puDst->ai8[ 3] = RT_MAX(puSrc1->ai8[ 3], puSrc2->ai8[ 3]);
11631 puDst->ai8[ 4] = RT_MAX(puSrc1->ai8[ 4], puSrc2->ai8[ 4]);
11632 puDst->ai8[ 5] = RT_MAX(puSrc1->ai8[ 5], puSrc2->ai8[ 5]);
11633 puDst->ai8[ 6] = RT_MAX(puSrc1->ai8[ 6], puSrc2->ai8[ 6]);
11634 puDst->ai8[ 7] = RT_MAX(puSrc1->ai8[ 7], puSrc2->ai8[ 7]);
11635 puDst->ai8[ 8] = RT_MAX(puSrc1->ai8[ 8], puSrc2->ai8[ 8]);
11636 puDst->ai8[ 9] = RT_MAX(puSrc1->ai8[ 9], puSrc2->ai8[ 9]);
11637 puDst->ai8[10] = RT_MAX(puSrc1->ai8[10], puSrc2->ai8[10]);
11638 puDst->ai8[11] = RT_MAX(puSrc1->ai8[11], puSrc2->ai8[11]);
11639 puDst->ai8[12] = RT_MAX(puSrc1->ai8[12], puSrc2->ai8[12]);
11640 puDst->ai8[13] = RT_MAX(puSrc1->ai8[13], puSrc2->ai8[13]);
11641 puDst->ai8[14] = RT_MAX(puSrc1->ai8[14], puSrc2->ai8[14]);
11642 puDst->ai8[15] = RT_MAX(puSrc1->ai8[15], puSrc2->ai8[15]);
11643}
11644
11645
11646IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxsb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11647{
11648 puDst->ai8[ 0] = RT_MAX(puSrc1->ai8[ 0], puSrc2->ai8[ 0]);
11649 puDst->ai8[ 1] = RT_MAX(puSrc1->ai8[ 1], puSrc2->ai8[ 1]);
11650 puDst->ai8[ 2] = RT_MAX(puSrc1->ai8[ 2], puSrc2->ai8[ 2]);
11651 puDst->ai8[ 3] = RT_MAX(puSrc1->ai8[ 3], puSrc2->ai8[ 3]);
11652 puDst->ai8[ 4] = RT_MAX(puSrc1->ai8[ 4], puSrc2->ai8[ 4]);
11653 puDst->ai8[ 5] = RT_MAX(puSrc1->ai8[ 5], puSrc2->ai8[ 5]);
11654 puDst->ai8[ 6] = RT_MAX(puSrc1->ai8[ 6], puSrc2->ai8[ 6]);
11655 puDst->ai8[ 7] = RT_MAX(puSrc1->ai8[ 7], puSrc2->ai8[ 7]);
11656 puDst->ai8[ 8] = RT_MAX(puSrc1->ai8[ 8], puSrc2->ai8[ 8]);
11657 puDst->ai8[ 9] = RT_MAX(puSrc1->ai8[ 9], puSrc2->ai8[ 9]);
11658 puDst->ai8[10] = RT_MAX(puSrc1->ai8[10], puSrc2->ai8[10]);
11659 puDst->ai8[11] = RT_MAX(puSrc1->ai8[11], puSrc2->ai8[11]);
11660 puDst->ai8[12] = RT_MAX(puSrc1->ai8[12], puSrc2->ai8[12]);
11661 puDst->ai8[13] = RT_MAX(puSrc1->ai8[13], puSrc2->ai8[13]);
11662 puDst->ai8[14] = RT_MAX(puSrc1->ai8[14], puSrc2->ai8[14]);
11663 puDst->ai8[15] = RT_MAX(puSrc1->ai8[15], puSrc2->ai8[15]);
11664 puDst->ai8[16] = RT_MAX(puSrc1->ai8[16], puSrc2->ai8[16]);
11665 puDst->ai8[17] = RT_MAX(puSrc1->ai8[17], puSrc2->ai8[17]);
11666 puDst->ai8[18] = RT_MAX(puSrc1->ai8[18], puSrc2->ai8[18]);
11667 puDst->ai8[19] = RT_MAX(puSrc1->ai8[19], puSrc2->ai8[19]);
11668 puDst->ai8[20] = RT_MAX(puSrc1->ai8[20], puSrc2->ai8[20]);
11669 puDst->ai8[21] = RT_MAX(puSrc1->ai8[21], puSrc2->ai8[21]);
11670 puDst->ai8[22] = RT_MAX(puSrc1->ai8[22], puSrc2->ai8[22]);
11671 puDst->ai8[23] = RT_MAX(puSrc1->ai8[23], puSrc2->ai8[23]);
11672 puDst->ai8[24] = RT_MAX(puSrc1->ai8[24], puSrc2->ai8[24]);
11673 puDst->ai8[25] = RT_MAX(puSrc1->ai8[25], puSrc2->ai8[25]);
11674 puDst->ai8[26] = RT_MAX(puSrc1->ai8[26], puSrc2->ai8[26]);
11675 puDst->ai8[27] = RT_MAX(puSrc1->ai8[27], puSrc2->ai8[27]);
11676 puDst->ai8[28] = RT_MAX(puSrc1->ai8[28], puSrc2->ai8[28]);
11677 puDst->ai8[29] = RT_MAX(puSrc1->ai8[29], puSrc2->ai8[29]);
11678 puDst->ai8[30] = RT_MAX(puSrc1->ai8[30], puSrc2->ai8[30]);
11679 puDst->ai8[31] = RT_MAX(puSrc1->ai8[31], puSrc2->ai8[31]);
11680}
11681
11682
11683IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxsw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11684{
11685 puDst->ai16[ 0] = RT_MAX(puSrc1->ai16[ 0], puSrc2->ai16[ 0]);
11686 puDst->ai16[ 1] = RT_MAX(puSrc1->ai16[ 1], puSrc2->ai16[ 1]);
11687 puDst->ai16[ 2] = RT_MAX(puSrc1->ai16[ 2], puSrc2->ai16[ 2]);
11688 puDst->ai16[ 3] = RT_MAX(puSrc1->ai16[ 3], puSrc2->ai16[ 3]);
11689 puDst->ai16[ 4] = RT_MAX(puSrc1->ai16[ 4], puSrc2->ai16[ 4]);
11690 puDst->ai16[ 5] = RT_MAX(puSrc1->ai16[ 5], puSrc2->ai16[ 5]);
11691 puDst->ai16[ 6] = RT_MAX(puSrc1->ai16[ 6], puSrc2->ai16[ 6]);
11692 puDst->ai16[ 7] = RT_MAX(puSrc1->ai16[ 7], puSrc2->ai16[ 7]);
11693}
11694
11695
11696IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxsw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11697{
11698 puDst->ai16[ 0] = RT_MAX(puSrc1->ai16[ 0], puSrc2->ai16[ 0]);
11699 puDst->ai16[ 1] = RT_MAX(puSrc1->ai16[ 1], puSrc2->ai16[ 1]);
11700 puDst->ai16[ 2] = RT_MAX(puSrc1->ai16[ 2], puSrc2->ai16[ 2]);
11701 puDst->ai16[ 3] = RT_MAX(puSrc1->ai16[ 3], puSrc2->ai16[ 3]);
11702 puDst->ai16[ 4] = RT_MAX(puSrc1->ai16[ 4], puSrc2->ai16[ 4]);
11703 puDst->ai16[ 5] = RT_MAX(puSrc1->ai16[ 5], puSrc2->ai16[ 5]);
11704 puDst->ai16[ 6] = RT_MAX(puSrc1->ai16[ 6], puSrc2->ai16[ 6]);
11705 puDst->ai16[ 7] = RT_MAX(puSrc1->ai16[ 7], puSrc2->ai16[ 7]);
11706 puDst->ai16[ 8] = RT_MAX(puSrc1->ai16[ 8], puSrc2->ai16[ 8]);
11707 puDst->ai16[ 9] = RT_MAX(puSrc1->ai16[ 9], puSrc2->ai16[ 9]);
11708 puDst->ai16[10] = RT_MAX(puSrc1->ai16[10], puSrc2->ai16[10]);
11709 puDst->ai16[11] = RT_MAX(puSrc1->ai16[11], puSrc2->ai16[11]);
11710 puDst->ai16[12] = RT_MAX(puSrc1->ai16[12], puSrc2->ai16[12]);
11711 puDst->ai16[13] = RT_MAX(puSrc1->ai16[13], puSrc2->ai16[13]);
11712 puDst->ai16[14] = RT_MAX(puSrc1->ai16[14], puSrc2->ai16[14]);
11713 puDst->ai16[15] = RT_MAX(puSrc1->ai16[15], puSrc2->ai16[15]);
11714}
11715
11716
11717IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxsd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11718{
11719 puDst->ai32[ 0] = RT_MAX(puSrc1->ai32[ 0], puSrc2->ai32[ 0]);
11720 puDst->ai32[ 1] = RT_MAX(puSrc1->ai32[ 1], puSrc2->ai32[ 1]);
11721 puDst->ai32[ 2] = RT_MAX(puSrc1->ai32[ 2], puSrc2->ai32[ 2]);
11722 puDst->ai32[ 3] = RT_MAX(puSrc1->ai32[ 3], puSrc2->ai32[ 3]);
11723}
11724
11725
11726IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxsd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11727{
11728 puDst->ai32[ 0] = RT_MAX(puSrc1->ai32[ 0], puSrc2->ai32[ 0]);
11729 puDst->ai32[ 1] = RT_MAX(puSrc1->ai32[ 1], puSrc2->ai32[ 1]);
11730 puDst->ai32[ 2] = RT_MAX(puSrc1->ai32[ 2], puSrc2->ai32[ 2]);
11731 puDst->ai32[ 3] = RT_MAX(puSrc1->ai32[ 3], puSrc2->ai32[ 3]);
11732 puDst->ai32[ 4] = RT_MAX(puSrc1->ai32[ 4], puSrc2->ai32[ 4]);
11733 puDst->ai32[ 5] = RT_MAX(puSrc1->ai32[ 5], puSrc2->ai32[ 5]);
11734 puDst->ai32[ 6] = RT_MAX(puSrc1->ai32[ 6], puSrc2->ai32[ 6]);
11735 puDst->ai32[ 7] = RT_MAX(puSrc1->ai32[ 7], puSrc2->ai32[ 7]);
11736}
11737
11738
11739/*
11740 * PMINUB / VPMINUB / PMINUW / VPMINUW / PMINUD / VPMINUD
11741 */
11742#ifdef IEM_WITHOUT_ASSEMBLY
11743
11744IEM_DECL_IMPL_DEF(void, iemAImpl_pminub_u64,(uint64_t *puDst, uint64_t const *puSrc))
11745{
11746 RTUINT64U uSrc1 = { *puDst };
11747 RTUINT64U uSrc2 = { *puSrc };
11748 RTUINT64U uDst;
11749
11750 uDst.au8[0] = RT_MIN(uSrc1.au8[0], uSrc2.au8[0]);
11751 uDst.au8[1] = RT_MIN(uSrc1.au8[1], uSrc2.au8[1]);
11752 uDst.au8[2] = RT_MIN(uSrc1.au8[2], uSrc2.au8[2]);
11753 uDst.au8[3] = RT_MIN(uSrc1.au8[3], uSrc2.au8[3]);
11754 uDst.au8[4] = RT_MIN(uSrc1.au8[4], uSrc2.au8[4]);
11755 uDst.au8[5] = RT_MIN(uSrc1.au8[5], uSrc2.au8[5]);
11756 uDst.au8[6] = RT_MIN(uSrc1.au8[6], uSrc2.au8[6]);
11757 uDst.au8[7] = RT_MIN(uSrc1.au8[7], uSrc2.au8[7]);
11758 *puDst = uDst.u;
11759}
11760
11761
11762IEM_DECL_IMPL_DEF(void, iemAImpl_pminub_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11763{
11764 RTUINT128U uSrc1 = *puDst;
11765
11766 puDst->au8[ 0] = RT_MIN(uSrc1.au8[ 0], puSrc->au8[ 0]);
11767 puDst->au8[ 1] = RT_MIN(uSrc1.au8[ 1], puSrc->au8[ 1]);
11768 puDst->au8[ 2] = RT_MIN(uSrc1.au8[ 2], puSrc->au8[ 2]);
11769 puDst->au8[ 3] = RT_MIN(uSrc1.au8[ 3], puSrc->au8[ 3]);
11770 puDst->au8[ 4] = RT_MIN(uSrc1.au8[ 4], puSrc->au8[ 4]);
11771 puDst->au8[ 5] = RT_MIN(uSrc1.au8[ 5], puSrc->au8[ 5]);
11772 puDst->au8[ 6] = RT_MIN(uSrc1.au8[ 6], puSrc->au8[ 6]);
11773 puDst->au8[ 7] = RT_MIN(uSrc1.au8[ 7], puSrc->au8[ 7]);
11774 puDst->au8[ 8] = RT_MIN(uSrc1.au8[ 8], puSrc->au8[ 8]);
11775 puDst->au8[ 9] = RT_MIN(uSrc1.au8[ 9], puSrc->au8[ 9]);
11776 puDst->au8[10] = RT_MIN(uSrc1.au8[10], puSrc->au8[10]);
11777 puDst->au8[11] = RT_MIN(uSrc1.au8[11], puSrc->au8[11]);
11778 puDst->au8[12] = RT_MIN(uSrc1.au8[12], puSrc->au8[12]);
11779 puDst->au8[13] = RT_MIN(uSrc1.au8[13], puSrc->au8[13]);
11780 puDst->au8[14] = RT_MIN(uSrc1.au8[14], puSrc->au8[14]);
11781 puDst->au8[15] = RT_MIN(uSrc1.au8[15], puSrc->au8[15]);
11782}
11783
11784#endif
11785
11786IEM_DECL_IMPL_DEF(void, iemAImpl_pminuw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11787{
11788 RTUINT128U uSrc1 = *puDst;
11789
11790 puDst->au16[ 0] = RT_MIN(uSrc1.au16[ 0], puSrc->au16[ 0]);
11791 puDst->au16[ 1] = RT_MIN(uSrc1.au16[ 1], puSrc->au16[ 1]);
11792 puDst->au16[ 2] = RT_MIN(uSrc1.au16[ 2], puSrc->au16[ 2]);
11793 puDst->au16[ 3] = RT_MIN(uSrc1.au16[ 3], puSrc->au16[ 3]);
11794 puDst->au16[ 4] = RT_MIN(uSrc1.au16[ 4], puSrc->au16[ 4]);
11795 puDst->au16[ 5] = RT_MIN(uSrc1.au16[ 5], puSrc->au16[ 5]);
11796 puDst->au16[ 6] = RT_MIN(uSrc1.au16[ 6], puSrc->au16[ 6]);
11797 puDst->au16[ 7] = RT_MIN(uSrc1.au16[ 7], puSrc->au16[ 7]);
11798}
11799
11800
11801IEM_DECL_IMPL_DEF(void, iemAImpl_pminud_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11802{
11803 RTUINT128U uSrc1 = *puDst;
11804
11805 puDst->au32[ 0] = RT_MIN(uSrc1.au32[ 0], puSrc->au32[ 0]);
11806 puDst->au32[ 1] = RT_MIN(uSrc1.au32[ 1], puSrc->au32[ 1]);
11807 puDst->au32[ 2] = RT_MIN(uSrc1.au32[ 2], puSrc->au32[ 2]);
11808 puDst->au32[ 3] = RT_MIN(uSrc1.au32[ 3], puSrc->au32[ 3]);
11809}
11810
11811
11812IEM_DECL_IMPL_DEF(void, iemAImpl_vpminub_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11813{
11814 puDst->au8[ 0] = RT_MIN(puSrc1->au8[ 0], puSrc2->au8[ 0]);
11815 puDst->au8[ 1] = RT_MIN(puSrc1->au8[ 1], puSrc2->au8[ 1]);
11816 puDst->au8[ 2] = RT_MIN(puSrc1->au8[ 2], puSrc2->au8[ 2]);
11817 puDst->au8[ 3] = RT_MIN(puSrc1->au8[ 3], puSrc2->au8[ 3]);
11818 puDst->au8[ 4] = RT_MIN(puSrc1->au8[ 4], puSrc2->au8[ 4]);
11819 puDst->au8[ 5] = RT_MIN(puSrc1->au8[ 5], puSrc2->au8[ 5]);
11820 puDst->au8[ 6] = RT_MIN(puSrc1->au8[ 6], puSrc2->au8[ 6]);
11821 puDst->au8[ 7] = RT_MIN(puSrc1->au8[ 7], puSrc2->au8[ 7]);
11822 puDst->au8[ 8] = RT_MIN(puSrc1->au8[ 8], puSrc2->au8[ 8]);
11823 puDst->au8[ 9] = RT_MIN(puSrc1->au8[ 9], puSrc2->au8[ 9]);
11824 puDst->au8[10] = RT_MIN(puSrc1->au8[10], puSrc2->au8[10]);
11825 puDst->au8[11] = RT_MIN(puSrc1->au8[11], puSrc2->au8[11]);
11826 puDst->au8[12] = RT_MIN(puSrc1->au8[12], puSrc2->au8[12]);
11827 puDst->au8[13] = RT_MIN(puSrc1->au8[13], puSrc2->au8[13]);
11828 puDst->au8[14] = RT_MIN(puSrc1->au8[14], puSrc2->au8[14]);
11829 puDst->au8[15] = RT_MIN(puSrc1->au8[15], puSrc2->au8[15]);
11830}
11831
11832
11833IEM_DECL_IMPL_DEF(void, iemAImpl_vpminub_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11834{
11835 puDst->au8[ 0] = RT_MIN(puSrc1->au8[ 0], puSrc2->au8[ 0]);
11836 puDst->au8[ 1] = RT_MIN(puSrc1->au8[ 1], puSrc2->au8[ 1]);
11837 puDst->au8[ 2] = RT_MIN(puSrc1->au8[ 2], puSrc2->au8[ 2]);
11838 puDst->au8[ 3] = RT_MIN(puSrc1->au8[ 3], puSrc2->au8[ 3]);
11839 puDst->au8[ 4] = RT_MIN(puSrc1->au8[ 4], puSrc2->au8[ 4]);
11840 puDst->au8[ 5] = RT_MIN(puSrc1->au8[ 5], puSrc2->au8[ 5]);
11841 puDst->au8[ 6] = RT_MIN(puSrc1->au8[ 6], puSrc2->au8[ 6]);
11842 puDst->au8[ 7] = RT_MIN(puSrc1->au8[ 7], puSrc2->au8[ 7]);
11843 puDst->au8[ 8] = RT_MIN(puSrc1->au8[ 8], puSrc2->au8[ 8]);
11844 puDst->au8[ 9] = RT_MIN(puSrc1->au8[ 9], puSrc2->au8[ 9]);
11845 puDst->au8[10] = RT_MIN(puSrc1->au8[10], puSrc2->au8[10]);
11846 puDst->au8[11] = RT_MIN(puSrc1->au8[11], puSrc2->au8[11]);
11847 puDst->au8[12] = RT_MIN(puSrc1->au8[12], puSrc2->au8[12]);
11848 puDst->au8[13] = RT_MIN(puSrc1->au8[13], puSrc2->au8[13]);
11849 puDst->au8[14] = RT_MIN(puSrc1->au8[14], puSrc2->au8[14]);
11850 puDst->au8[15] = RT_MIN(puSrc1->au8[15], puSrc2->au8[15]);
11851 puDst->au8[16] = RT_MIN(puSrc1->au8[16], puSrc2->au8[16]);
11852 puDst->au8[17] = RT_MIN(puSrc1->au8[17], puSrc2->au8[17]);
11853 puDst->au8[18] = RT_MIN(puSrc1->au8[18], puSrc2->au8[18]);
11854 puDst->au8[19] = RT_MIN(puSrc1->au8[19], puSrc2->au8[19]);
11855 puDst->au8[20] = RT_MIN(puSrc1->au8[20], puSrc2->au8[20]);
11856 puDst->au8[21] = RT_MIN(puSrc1->au8[21], puSrc2->au8[21]);
11857 puDst->au8[22] = RT_MIN(puSrc1->au8[22], puSrc2->au8[22]);
11858 puDst->au8[23] = RT_MIN(puSrc1->au8[23], puSrc2->au8[23]);
11859 puDst->au8[24] = RT_MIN(puSrc1->au8[24], puSrc2->au8[24]);
11860 puDst->au8[25] = RT_MIN(puSrc1->au8[25], puSrc2->au8[25]);
11861 puDst->au8[26] = RT_MIN(puSrc1->au8[26], puSrc2->au8[26]);
11862 puDst->au8[27] = RT_MIN(puSrc1->au8[27], puSrc2->au8[27]);
11863 puDst->au8[28] = RT_MIN(puSrc1->au8[28], puSrc2->au8[28]);
11864 puDst->au8[29] = RT_MIN(puSrc1->au8[29], puSrc2->au8[29]);
11865 puDst->au8[30] = RT_MIN(puSrc1->au8[30], puSrc2->au8[30]);
11866 puDst->au8[31] = RT_MIN(puSrc1->au8[31], puSrc2->au8[31]);
11867}
11868
11869
11870IEM_DECL_IMPL_DEF(void, iemAImpl_vpminuw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11871{
11872 puDst->au16[ 0] = RT_MIN(puSrc1->au16[ 0], puSrc2->au16[ 0]);
11873 puDst->au16[ 1] = RT_MIN(puSrc1->au16[ 1], puSrc2->au16[ 1]);
11874 puDst->au16[ 2] = RT_MIN(puSrc1->au16[ 2], puSrc2->au16[ 2]);
11875 puDst->au16[ 3] = RT_MIN(puSrc1->au16[ 3], puSrc2->au16[ 3]);
11876 puDst->au16[ 4] = RT_MIN(puSrc1->au16[ 4], puSrc2->au16[ 4]);
11877 puDst->au16[ 5] = RT_MIN(puSrc1->au16[ 5], puSrc2->au16[ 5]);
11878 puDst->au16[ 6] = RT_MIN(puSrc1->au16[ 6], puSrc2->au16[ 6]);
11879 puDst->au16[ 7] = RT_MIN(puSrc1->au16[ 7], puSrc2->au16[ 7]);
11880}
11881
11882
11883IEM_DECL_IMPL_DEF(void, iemAImpl_vpminuw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11884{
11885 puDst->au16[ 0] = RT_MIN(puSrc1->au16[ 0], puSrc2->au16[ 0]);
11886 puDst->au16[ 1] = RT_MIN(puSrc1->au16[ 1], puSrc2->au16[ 1]);
11887 puDst->au16[ 2] = RT_MIN(puSrc1->au16[ 2], puSrc2->au16[ 2]);
11888 puDst->au16[ 3] = RT_MIN(puSrc1->au16[ 3], puSrc2->au16[ 3]);
11889 puDst->au16[ 4] = RT_MIN(puSrc1->au16[ 4], puSrc2->au16[ 4]);
11890 puDst->au16[ 5] = RT_MIN(puSrc1->au16[ 5], puSrc2->au16[ 5]);
11891 puDst->au16[ 6] = RT_MIN(puSrc1->au16[ 6], puSrc2->au16[ 6]);
11892 puDst->au16[ 7] = RT_MIN(puSrc1->au16[ 7], puSrc2->au16[ 7]);
11893 puDst->au16[ 8] = RT_MIN(puSrc1->au16[ 8], puSrc2->au16[ 8]);
11894 puDst->au16[ 9] = RT_MIN(puSrc1->au16[ 9], puSrc2->au16[ 9]);
11895 puDst->au16[10] = RT_MIN(puSrc1->au16[10], puSrc2->au16[10]);
11896 puDst->au16[11] = RT_MIN(puSrc1->au16[11], puSrc2->au16[11]);
11897 puDst->au16[12] = RT_MIN(puSrc1->au16[12], puSrc2->au16[12]);
11898 puDst->au16[13] = RT_MIN(puSrc1->au16[13], puSrc2->au16[13]);
11899 puDst->au16[14] = RT_MIN(puSrc1->au16[14], puSrc2->au16[14]);
11900 puDst->au16[15] = RT_MIN(puSrc1->au16[15], puSrc2->au16[15]);
11901}
11902
11903
11904IEM_DECL_IMPL_DEF(void, iemAImpl_vpminud_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11905{
11906 puDst->au32[ 0] = RT_MIN(puSrc1->au32[ 0], puSrc2->au32[ 0]);
11907 puDst->au32[ 1] = RT_MIN(puSrc1->au32[ 1], puSrc2->au32[ 1]);
11908 puDst->au32[ 2] = RT_MIN(puSrc1->au32[ 2], puSrc2->au32[ 2]);
11909 puDst->au32[ 3] = RT_MIN(puSrc1->au32[ 3], puSrc2->au32[ 3]);
11910}
11911
11912
11913IEM_DECL_IMPL_DEF(void, iemAImpl_vpminud_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11914{
11915 puDst->au32[ 0] = RT_MIN(puSrc1->au32[ 0], puSrc2->au32[ 0]);
11916 puDst->au32[ 1] = RT_MIN(puSrc1->au32[ 1], puSrc2->au32[ 1]);
11917 puDst->au32[ 2] = RT_MIN(puSrc1->au32[ 2], puSrc2->au32[ 2]);
11918 puDst->au32[ 3] = RT_MIN(puSrc1->au32[ 3], puSrc2->au32[ 3]);
11919 puDst->au32[ 4] = RT_MIN(puSrc1->au32[ 4], puSrc2->au32[ 4]);
11920 puDst->au32[ 5] = RT_MIN(puSrc1->au32[ 5], puSrc2->au32[ 5]);
11921 puDst->au32[ 6] = RT_MIN(puSrc1->au32[ 6], puSrc2->au32[ 6]);
11922 puDst->au32[ 7] = RT_MIN(puSrc1->au32[ 7], puSrc2->au32[ 7]);
11923}
11924
11925
11926/*
11927 * PMINSB / VPMINSB / PMINSW / VPMINSW / PMINSD / VPMINSD
11928 */
11929#ifdef IEM_WITHOUT_ASSEMBLY
11930
11931IEM_DECL_IMPL_DEF(void, iemAImpl_pminsw_u64,(uint64_t *puDst, uint64_t const *puSrc))
11932{
11933 RTUINT64U uSrc1 = { *puDst };
11934 RTUINT64U uSrc2 = { *puSrc };
11935 RTUINT64U uDst;
11936
11937 uDst.ai16[0] = RT_MIN(uSrc1.ai16[0], uSrc2.ai16[0]);
11938 uDst.ai16[1] = RT_MIN(uSrc1.ai16[1], uSrc2.ai16[1]);
11939 uDst.ai16[2] = RT_MIN(uSrc1.ai16[2], uSrc2.ai16[2]);
11940 uDst.ai16[3] = RT_MIN(uSrc1.ai16[3], uSrc2.ai16[3]);
11941 *puDst = uDst.u;
11942}
11943
11944
11945IEM_DECL_IMPL_DEF(void, iemAImpl_pminsw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11946{
11947 RTUINT128U uSrc1 = *puDst;
11948
11949 puDst->ai16[ 0] = RT_MIN(uSrc1.ai16[ 0], puSrc->ai16[ 0]);
11950 puDst->ai16[ 1] = RT_MIN(uSrc1.ai16[ 1], puSrc->ai16[ 1]);
11951 puDst->ai16[ 2] = RT_MIN(uSrc1.ai16[ 2], puSrc->ai16[ 2]);
11952 puDst->ai16[ 3] = RT_MIN(uSrc1.ai16[ 3], puSrc->ai16[ 3]);
11953 puDst->ai16[ 4] = RT_MIN(uSrc1.ai16[ 4], puSrc->ai16[ 4]);
11954 puDst->ai16[ 5] = RT_MIN(uSrc1.ai16[ 5], puSrc->ai16[ 5]);
11955 puDst->ai16[ 6] = RT_MIN(uSrc1.ai16[ 6], puSrc->ai16[ 6]);
11956 puDst->ai16[ 7] = RT_MIN(uSrc1.ai16[ 7], puSrc->ai16[ 7]);
11957}
11958
11959#endif
11960
11961IEM_DECL_IMPL_DEF(void, iemAImpl_pminsb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11962{
11963 RTUINT128U uSrc1 = *puDst;
11964
11965 puDst->ai8[ 0] = RT_MIN(uSrc1.ai8[ 0], puSrc->ai8[ 0]);
11966 puDst->ai8[ 1] = RT_MIN(uSrc1.ai8[ 1], puSrc->ai8[ 1]);
11967 puDst->ai8[ 2] = RT_MIN(uSrc1.ai8[ 2], puSrc->ai8[ 2]);
11968 puDst->ai8[ 3] = RT_MIN(uSrc1.ai8[ 3], puSrc->ai8[ 3]);
11969 puDst->ai8[ 4] = RT_MIN(uSrc1.ai8[ 4], puSrc->ai8[ 4]);
11970 puDst->ai8[ 5] = RT_MIN(uSrc1.ai8[ 5], puSrc->ai8[ 5]);
11971 puDst->ai8[ 6] = RT_MIN(uSrc1.ai8[ 6], puSrc->ai8[ 6]);
11972 puDst->ai8[ 7] = RT_MIN(uSrc1.ai8[ 7], puSrc->ai8[ 7]);
11973 puDst->ai8[ 8] = RT_MIN(uSrc1.ai8[ 8], puSrc->ai8[ 8]);
11974 puDst->ai8[ 9] = RT_MIN(uSrc1.ai8[ 9], puSrc->ai8[ 9]);
11975 puDst->ai8[10] = RT_MIN(uSrc1.ai8[10], puSrc->ai8[10]);
11976 puDst->ai8[11] = RT_MIN(uSrc1.ai8[11], puSrc->ai8[11]);
11977 puDst->ai8[12] = RT_MIN(uSrc1.ai8[12], puSrc->ai8[12]);
11978 puDst->ai8[13] = RT_MIN(uSrc1.ai8[13], puSrc->ai8[13]);
11979 puDst->ai8[14] = RT_MIN(uSrc1.ai8[14], puSrc->ai8[14]);
11980 puDst->ai8[15] = RT_MIN(uSrc1.ai8[15], puSrc->ai8[15]);
11981}
11982
11983
11984IEM_DECL_IMPL_DEF(void, iemAImpl_pminsd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11985{
11986 RTUINT128U uSrc1 = *puDst;
11987
11988 puDst->ai32[ 0] = RT_MIN(uSrc1.ai32[ 0], puSrc->ai32[ 0]);
11989 puDst->ai32[ 1] = RT_MIN(uSrc1.ai32[ 1], puSrc->ai32[ 1]);
11990 puDst->ai32[ 2] = RT_MIN(uSrc1.ai32[ 2], puSrc->ai32[ 2]);
11991 puDst->ai32[ 3] = RT_MIN(uSrc1.ai32[ 3], puSrc->ai32[ 3]);
11992}
11993
11994
11995IEM_DECL_IMPL_DEF(void, iemAImpl_vpminsb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11996{
11997 puDst->ai8[ 0] = RT_MIN(puSrc1->ai8[ 0], puSrc2->ai8[ 0]);
11998 puDst->ai8[ 1] = RT_MIN(puSrc1->ai8[ 1], puSrc2->ai8[ 1]);
11999 puDst->ai8[ 2] = RT_MIN(puSrc1->ai8[ 2], puSrc2->ai8[ 2]);
12000 puDst->ai8[ 3] = RT_MIN(puSrc1->ai8[ 3], puSrc2->ai8[ 3]);
12001 puDst->ai8[ 4] = RT_MIN(puSrc1->ai8[ 4], puSrc2->ai8[ 4]);
12002 puDst->ai8[ 5] = RT_MIN(puSrc1->ai8[ 5], puSrc2->ai8[ 5]);
12003 puDst->ai8[ 6] = RT_MIN(puSrc1->ai8[ 6], puSrc2->ai8[ 6]);
12004 puDst->ai8[ 7] = RT_MIN(puSrc1->ai8[ 7], puSrc2->ai8[ 7]);
12005 puDst->ai8[ 8] = RT_MIN(puSrc1->ai8[ 8], puSrc2->ai8[ 8]);
12006 puDst->ai8[ 9] = RT_MIN(puSrc1->ai8[ 9], puSrc2->ai8[ 9]);
12007 puDst->ai8[10] = RT_MIN(puSrc1->ai8[10], puSrc2->ai8[10]);
12008 puDst->ai8[11] = RT_MIN(puSrc1->ai8[11], puSrc2->ai8[11]);
12009 puDst->ai8[12] = RT_MIN(puSrc1->ai8[12], puSrc2->ai8[12]);
12010 puDst->ai8[13] = RT_MIN(puSrc1->ai8[13], puSrc2->ai8[13]);
12011 puDst->ai8[14] = RT_MIN(puSrc1->ai8[14], puSrc2->ai8[14]);
12012 puDst->ai8[15] = RT_MIN(puSrc1->ai8[15], puSrc2->ai8[15]);
12013}
12014
12015
12016IEM_DECL_IMPL_DEF(void, iemAImpl_vpminsb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12017{
12018 puDst->ai8[ 0] = RT_MIN(puSrc1->ai8[ 0], puSrc2->ai8[ 0]);
12019 puDst->ai8[ 1] = RT_MIN(puSrc1->ai8[ 1], puSrc2->ai8[ 1]);
12020 puDst->ai8[ 2] = RT_MIN(puSrc1->ai8[ 2], puSrc2->ai8[ 2]);
12021 puDst->ai8[ 3] = RT_MIN(puSrc1->ai8[ 3], puSrc2->ai8[ 3]);
12022 puDst->ai8[ 4] = RT_MIN(puSrc1->ai8[ 4], puSrc2->ai8[ 4]);
12023 puDst->ai8[ 5] = RT_MIN(puSrc1->ai8[ 5], puSrc2->ai8[ 5]);
12024 puDst->ai8[ 6] = RT_MIN(puSrc1->ai8[ 6], puSrc2->ai8[ 6]);
12025 puDst->ai8[ 7] = RT_MIN(puSrc1->ai8[ 7], puSrc2->ai8[ 7]);
12026 puDst->ai8[ 8] = RT_MIN(puSrc1->ai8[ 8], puSrc2->ai8[ 8]);
12027 puDst->ai8[ 9] = RT_MIN(puSrc1->ai8[ 9], puSrc2->ai8[ 9]);
12028 puDst->ai8[10] = RT_MIN(puSrc1->ai8[10], puSrc2->ai8[10]);
12029 puDst->ai8[11] = RT_MIN(puSrc1->ai8[11], puSrc2->ai8[11]);
12030 puDst->ai8[12] = RT_MIN(puSrc1->ai8[12], puSrc2->ai8[12]);
12031 puDst->ai8[13] = RT_MIN(puSrc1->ai8[13], puSrc2->ai8[13]);
12032 puDst->ai8[14] = RT_MIN(puSrc1->ai8[14], puSrc2->ai8[14]);
12033 puDst->ai8[15] = RT_MIN(puSrc1->ai8[15], puSrc2->ai8[15]);
12034 puDst->ai8[16] = RT_MIN(puSrc1->ai8[16], puSrc2->ai8[16]);
12035 puDst->ai8[17] = RT_MIN(puSrc1->ai8[17], puSrc2->ai8[17]);
12036 puDst->ai8[18] = RT_MIN(puSrc1->ai8[18], puSrc2->ai8[18]);
12037 puDst->ai8[19] = RT_MIN(puSrc1->ai8[19], puSrc2->ai8[19]);
12038 puDst->ai8[20] = RT_MIN(puSrc1->ai8[20], puSrc2->ai8[20]);
12039 puDst->ai8[21] = RT_MIN(puSrc1->ai8[21], puSrc2->ai8[21]);
12040 puDst->ai8[22] = RT_MIN(puSrc1->ai8[22], puSrc2->ai8[22]);
12041 puDst->ai8[23] = RT_MIN(puSrc1->ai8[23], puSrc2->ai8[23]);
12042 puDst->ai8[24] = RT_MIN(puSrc1->ai8[24], puSrc2->ai8[24]);
12043 puDst->ai8[25] = RT_MIN(puSrc1->ai8[25], puSrc2->ai8[25]);
12044 puDst->ai8[26] = RT_MIN(puSrc1->ai8[26], puSrc2->ai8[26]);
12045 puDst->ai8[27] = RT_MIN(puSrc1->ai8[27], puSrc2->ai8[27]);
12046 puDst->ai8[28] = RT_MIN(puSrc1->ai8[28], puSrc2->ai8[28]);
12047 puDst->ai8[29] = RT_MIN(puSrc1->ai8[29], puSrc2->ai8[29]);
12048 puDst->ai8[30] = RT_MIN(puSrc1->ai8[30], puSrc2->ai8[30]);
12049 puDst->ai8[31] = RT_MIN(puSrc1->ai8[31], puSrc2->ai8[31]);
12050}
12051
12052
12053IEM_DECL_IMPL_DEF(void, iemAImpl_vpminsw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12054{
12055 puDst->ai16[ 0] = RT_MIN(puSrc1->ai16[ 0], puSrc2->ai16[ 0]);
12056 puDst->ai16[ 1] = RT_MIN(puSrc1->ai16[ 1], puSrc2->ai16[ 1]);
12057 puDst->ai16[ 2] = RT_MIN(puSrc1->ai16[ 2], puSrc2->ai16[ 2]);
12058 puDst->ai16[ 3] = RT_MIN(puSrc1->ai16[ 3], puSrc2->ai16[ 3]);
12059 puDst->ai16[ 4] = RT_MIN(puSrc1->ai16[ 4], puSrc2->ai16[ 4]);
12060 puDst->ai16[ 5] = RT_MIN(puSrc1->ai16[ 5], puSrc2->ai16[ 5]);
12061 puDst->ai16[ 6] = RT_MIN(puSrc1->ai16[ 6], puSrc2->ai16[ 6]);
12062 puDst->ai16[ 7] = RT_MIN(puSrc1->ai16[ 7], puSrc2->ai16[ 7]);
12063}
12064
12065
12066IEM_DECL_IMPL_DEF(void, iemAImpl_vpminsw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12067{
12068 puDst->ai16[ 0] = RT_MIN(puSrc1->ai16[ 0], puSrc2->ai16[ 0]);
12069 puDst->ai16[ 1] = RT_MIN(puSrc1->ai16[ 1], puSrc2->ai16[ 1]);
12070 puDst->ai16[ 2] = RT_MIN(puSrc1->ai16[ 2], puSrc2->ai16[ 2]);
12071 puDst->ai16[ 3] = RT_MIN(puSrc1->ai16[ 3], puSrc2->ai16[ 3]);
12072 puDst->ai16[ 4] = RT_MIN(puSrc1->ai16[ 4], puSrc2->ai16[ 4]);
12073 puDst->ai16[ 5] = RT_MIN(puSrc1->ai16[ 5], puSrc2->ai16[ 5]);
12074 puDst->ai16[ 6] = RT_MIN(puSrc1->ai16[ 6], puSrc2->ai16[ 6]);
12075 puDst->ai16[ 7] = RT_MIN(puSrc1->ai16[ 7], puSrc2->ai16[ 7]);
12076 puDst->ai16[ 8] = RT_MIN(puSrc1->ai16[ 8], puSrc2->ai16[ 8]);
12077 puDst->ai16[ 9] = RT_MIN(puSrc1->ai16[ 9], puSrc2->ai16[ 9]);
12078 puDst->ai16[10] = RT_MIN(puSrc1->ai16[10], puSrc2->ai16[10]);
12079 puDst->ai16[11] = RT_MIN(puSrc1->ai16[11], puSrc2->ai16[11]);
12080 puDst->ai16[12] = RT_MIN(puSrc1->ai16[12], puSrc2->ai16[12]);
12081 puDst->ai16[13] = RT_MIN(puSrc1->ai16[13], puSrc2->ai16[13]);
12082 puDst->ai16[14] = RT_MIN(puSrc1->ai16[14], puSrc2->ai16[14]);
12083 puDst->ai16[15] = RT_MIN(puSrc1->ai16[15], puSrc2->ai16[15]);
12084}
12085
12086
12087IEM_DECL_IMPL_DEF(void, iemAImpl_vpminsd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12088{
12089 puDst->ai32[ 0] = RT_MIN(puSrc1->ai32[ 0], puSrc2->ai32[ 0]);
12090 puDst->ai32[ 1] = RT_MIN(puSrc1->ai32[ 1], puSrc2->ai32[ 1]);
12091 puDst->ai32[ 2] = RT_MIN(puSrc1->ai32[ 2], puSrc2->ai32[ 2]);
12092 puDst->ai32[ 3] = RT_MIN(puSrc1->ai32[ 3], puSrc2->ai32[ 3]);
12093}
12094
12095
12096IEM_DECL_IMPL_DEF(void, iemAImpl_vpminsd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12097{
12098 puDst->ai32[ 0] = RT_MIN(puSrc1->ai32[ 0], puSrc2->ai32[ 0]);
12099 puDst->ai32[ 1] = RT_MIN(puSrc1->ai32[ 1], puSrc2->ai32[ 1]);
12100 puDst->ai32[ 2] = RT_MIN(puSrc1->ai32[ 2], puSrc2->ai32[ 2]);
12101 puDst->ai32[ 3] = RT_MIN(puSrc1->ai32[ 3], puSrc2->ai32[ 3]);
12102 puDst->ai32[ 4] = RT_MIN(puSrc1->ai32[ 4], puSrc2->ai32[ 4]);
12103 puDst->ai32[ 5] = RT_MIN(puSrc1->ai32[ 5], puSrc2->ai32[ 5]);
12104 puDst->ai32[ 6] = RT_MIN(puSrc1->ai32[ 6], puSrc2->ai32[ 6]);
12105 puDst->ai32[ 7] = RT_MIN(puSrc1->ai32[ 7], puSrc2->ai32[ 7]);
12106}
12107
12108
12109/*
12110 * PAVGB / VPAVGB / PAVGW / VPAVGW
12111 */
/** Byte average rounded up: (left + right + 1) >> 1.  The left operand is
 *  widened first so the sum cannot wrap before the shift; the result is
 *  narrowed back to uint8_t. */
#define PAVGB_EXEC(a_uLeft, a_uRight)   ((uint8_t)(((uint16_t)(a_uLeft) + (a_uRight) + 1) >> 1))
/** Word average rounded up: (left + right + 1) >> 1.  The left operand is
 *  widened to 32 bits before adding; the result is narrowed back to uint16_t. */
#define PAVGW_EXEC(a_uLeft, a_uRight)   ((uint16_t)(((uint32_t)(a_uLeft) + (a_uRight) + 1) >> 1))
12114
12115#ifdef IEM_WITHOUT_ASSEMBLY
12116
12117IEM_DECL_IMPL_DEF(void, iemAImpl_pavgb_u64,(uint64_t *puDst, uint64_t const *puSrc))
12118{
12119 RTUINT64U uSrc1 = { *puDst };
12120 RTUINT64U uSrc2 = { *puSrc };
12121 RTUINT64U uDst;
12122
12123 uDst.au8[0] = PAVGB_EXEC(uSrc1.au8[0], uSrc2.au8[0]);
12124 uDst.au8[1] = PAVGB_EXEC(uSrc1.au8[1], uSrc2.au8[1]);
12125 uDst.au8[2] = PAVGB_EXEC(uSrc1.au8[2], uSrc2.au8[2]);
12126 uDst.au8[3] = PAVGB_EXEC(uSrc1.au8[3], uSrc2.au8[3]);
12127 uDst.au8[4] = PAVGB_EXEC(uSrc1.au8[4], uSrc2.au8[4]);
12128 uDst.au8[5] = PAVGB_EXEC(uSrc1.au8[5], uSrc2.au8[5]);
12129 uDst.au8[6] = PAVGB_EXEC(uSrc1.au8[6], uSrc2.au8[6]);
12130 uDst.au8[7] = PAVGB_EXEC(uSrc1.au8[7], uSrc2.au8[7]);
12131 *puDst = uDst.u;
12132}
12133
12134
12135IEM_DECL_IMPL_DEF(void, iemAImpl_pavgb_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12136{
12137 RTUINT128U uSrc1 = *puDst;
12138
12139 puDst->au8[ 0] = PAVGB_EXEC(uSrc1.au8[ 0], puSrc->au8[ 0]);
12140 puDst->au8[ 1] = PAVGB_EXEC(uSrc1.au8[ 1], puSrc->au8[ 1]);
12141 puDst->au8[ 2] = PAVGB_EXEC(uSrc1.au8[ 2], puSrc->au8[ 2]);
12142 puDst->au8[ 3] = PAVGB_EXEC(uSrc1.au8[ 3], puSrc->au8[ 3]);
12143 puDst->au8[ 4] = PAVGB_EXEC(uSrc1.au8[ 4], puSrc->au8[ 4]);
12144 puDst->au8[ 5] = PAVGB_EXEC(uSrc1.au8[ 5], puSrc->au8[ 5]);
12145 puDst->au8[ 6] = PAVGB_EXEC(uSrc1.au8[ 6], puSrc->au8[ 6]);
12146 puDst->au8[ 7] = PAVGB_EXEC(uSrc1.au8[ 7], puSrc->au8[ 7]);
12147 puDst->au8[ 8] = PAVGB_EXEC(uSrc1.au8[ 8], puSrc->au8[ 8]);
12148 puDst->au8[ 9] = PAVGB_EXEC(uSrc1.au8[ 9], puSrc->au8[ 9]);
12149 puDst->au8[10] = PAVGB_EXEC(uSrc1.au8[10], puSrc->au8[10]);
12150 puDst->au8[11] = PAVGB_EXEC(uSrc1.au8[11], puSrc->au8[11]);
12151 puDst->au8[12] = PAVGB_EXEC(uSrc1.au8[12], puSrc->au8[12]);
12152 puDst->au8[13] = PAVGB_EXEC(uSrc1.au8[13], puSrc->au8[13]);
12153 puDst->au8[14] = PAVGB_EXEC(uSrc1.au8[14], puSrc->au8[14]);
12154 puDst->au8[15] = PAVGB_EXEC(uSrc1.au8[15], puSrc->au8[15]);
12155}
12156
12157
12158IEM_DECL_IMPL_DEF(void, iemAImpl_pavgw_u64,(uint64_t *puDst, uint64_t const *puSrc))
12159{
12160 RTUINT64U uSrc1 = { *puDst };
12161 RTUINT64U uSrc2 = { *puSrc };
12162 RTUINT64U uDst;
12163
12164 uDst.au16[0] = PAVGW_EXEC(uSrc1.au16[0], uSrc2.au16[0]);
12165 uDst.au16[1] = PAVGW_EXEC(uSrc1.au16[1], uSrc2.au16[1]);
12166 uDst.au16[2] = PAVGW_EXEC(uSrc1.au16[2], uSrc2.au16[2]);
12167 uDst.au16[3] = PAVGW_EXEC(uSrc1.au16[3], uSrc2.au16[3]);
12168 *puDst = uDst.u;
12169}
12170
12171
12172IEM_DECL_IMPL_DEF(void, iemAImpl_pavgw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12173{
12174 RTUINT128U uSrc1 = *puDst;
12175
12176 puDst->au16[0] = PAVGW_EXEC(uSrc1.au16[0], puSrc->au16[0]);
12177 puDst->au16[1] = PAVGW_EXEC(uSrc1.au16[1], puSrc->au16[1]);
12178 puDst->au16[2] = PAVGW_EXEC(uSrc1.au16[2], puSrc->au16[2]);
12179 puDst->au16[3] = PAVGW_EXEC(uSrc1.au16[3], puSrc->au16[3]);
12180 puDst->au16[4] = PAVGW_EXEC(uSrc1.au16[4], puSrc->au16[4]);
12181 puDst->au16[5] = PAVGW_EXEC(uSrc1.au16[5], puSrc->au16[5]);
12182 puDst->au16[6] = PAVGW_EXEC(uSrc1.au16[6], puSrc->au16[6]);
12183 puDst->au16[7] = PAVGW_EXEC(uSrc1.au16[7], puSrc->au16[7]);
12184}
12185
12186#endif
12187
12188IEM_DECL_IMPL_DEF(void, iemAImpl_pavgb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12189{
12190 RTUINT128U uSrc1 = *puDst;
12191
12192 puDst->au8[ 0] = PAVGB_EXEC(uSrc1.au8[ 0], puSrc->au8[ 0]);
12193 puDst->au8[ 1] = PAVGB_EXEC(uSrc1.au8[ 1], puSrc->au8[ 1]);
12194 puDst->au8[ 2] = PAVGB_EXEC(uSrc1.au8[ 2], puSrc->au8[ 2]);
12195 puDst->au8[ 3] = PAVGB_EXEC(uSrc1.au8[ 3], puSrc->au8[ 3]);
12196 puDst->au8[ 4] = PAVGB_EXEC(uSrc1.au8[ 4], puSrc->au8[ 4]);
12197 puDst->au8[ 5] = PAVGB_EXEC(uSrc1.au8[ 5], puSrc->au8[ 5]);
12198 puDst->au8[ 6] = PAVGB_EXEC(uSrc1.au8[ 6], puSrc->au8[ 6]);
12199 puDst->au8[ 7] = PAVGB_EXEC(uSrc1.au8[ 7], puSrc->au8[ 7]);
12200 puDst->au8[ 8] = PAVGB_EXEC(uSrc1.au8[ 8], puSrc->au8[ 8]);
12201 puDst->au8[ 9] = PAVGB_EXEC(uSrc1.au8[ 9], puSrc->au8[ 9]);
12202 puDst->au8[10] = PAVGB_EXEC(uSrc1.au8[10], puSrc->au8[10]);
12203 puDst->au8[11] = PAVGB_EXEC(uSrc1.au8[11], puSrc->au8[11]);
12204 puDst->au8[12] = PAVGB_EXEC(uSrc1.au8[12], puSrc->au8[12]);
12205 puDst->au8[13] = PAVGB_EXEC(uSrc1.au8[13], puSrc->au8[13]);
12206 puDst->au8[14] = PAVGB_EXEC(uSrc1.au8[14], puSrc->au8[14]);
12207 puDst->au8[15] = PAVGB_EXEC(uSrc1.au8[15], puSrc->au8[15]);
12208}
12209
12210
12211IEM_DECL_IMPL_DEF(void, iemAImpl_pavgw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12212{
12213 RTUINT128U uSrc1 = *puDst;
12214
12215 puDst->au8[ 0] = PAVGW_EXEC(uSrc1.au8[ 0], puSrc->au8[ 0]);
12216 puDst->au8[ 1] = PAVGW_EXEC(uSrc1.au8[ 1], puSrc->au8[ 1]);
12217 puDst->au8[ 2] = PAVGW_EXEC(uSrc1.au8[ 2], puSrc->au8[ 2]);
12218 puDst->au8[ 3] = PAVGW_EXEC(uSrc1.au8[ 3], puSrc->au8[ 3]);
12219 puDst->au8[ 4] = PAVGW_EXEC(uSrc1.au8[ 4], puSrc->au8[ 4]);
12220 puDst->au8[ 5] = PAVGW_EXEC(uSrc1.au8[ 5], puSrc->au8[ 5]);
12221 puDst->au8[ 6] = PAVGW_EXEC(uSrc1.au8[ 6], puSrc->au8[ 6]);
12222 puDst->au8[ 7] = PAVGW_EXEC(uSrc1.au8[ 7], puSrc->au8[ 7]);
12223 puDst->au8[ 8] = PAVGW_EXEC(uSrc1.au8[ 8], puSrc->au8[ 8]);
12224 puDst->au8[ 9] = PAVGW_EXEC(uSrc1.au8[ 9], puSrc->au8[ 9]);
12225 puDst->au8[10] = PAVGW_EXEC(uSrc1.au8[10], puSrc->au8[10]);
12226 puDst->au8[11] = PAVGW_EXEC(uSrc1.au8[11], puSrc->au8[11]);
12227 puDst->au8[12] = PAVGW_EXEC(uSrc1.au8[12], puSrc->au8[12]);
12228 puDst->au8[13] = PAVGW_EXEC(uSrc1.au8[13], puSrc->au8[13]);
12229 puDst->au8[14] = PAVGW_EXEC(uSrc1.au8[14], puSrc->au8[14]);
12230 puDst->au8[15] = PAVGW_EXEC(uSrc1.au8[15], puSrc->au8[15]);
12231}
12232
12233
12234IEM_DECL_IMPL_DEF(void, iemAImpl_vpavgb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12235{
12236 puDst->au8[ 0] = PAVGB_EXEC(puSrc1->au8[ 0], puSrc2->au8[ 0]);
12237 puDst->au8[ 1] = PAVGB_EXEC(puSrc1->au8[ 1], puSrc2->au8[ 1]);
12238 puDst->au8[ 2] = PAVGB_EXEC(puSrc1->au8[ 2], puSrc2->au8[ 2]);
12239 puDst->au8[ 3] = PAVGB_EXEC(puSrc1->au8[ 3], puSrc2->au8[ 3]);
12240 puDst->au8[ 4] = PAVGB_EXEC(puSrc1->au8[ 4], puSrc2->au8[ 4]);
12241 puDst->au8[ 5] = PAVGB_EXEC(puSrc1->au8[ 5], puSrc2->au8[ 5]);
12242 puDst->au8[ 6] = PAVGB_EXEC(puSrc1->au8[ 6], puSrc2->au8[ 6]);
12243 puDst->au8[ 7] = PAVGB_EXEC(puSrc1->au8[ 7], puSrc2->au8[ 7]);
12244 puDst->au8[ 8] = PAVGB_EXEC(puSrc1->au8[ 8], puSrc2->au8[ 8]);
12245 puDst->au8[ 9] = PAVGB_EXEC(puSrc1->au8[ 9], puSrc2->au8[ 9]);
12246 puDst->au8[10] = PAVGB_EXEC(puSrc1->au8[10], puSrc2->au8[10]);
12247 puDst->au8[11] = PAVGB_EXEC(puSrc1->au8[11], puSrc2->au8[11]);
12248 puDst->au8[12] = PAVGB_EXEC(puSrc1->au8[12], puSrc2->au8[12]);
12249 puDst->au8[13] = PAVGB_EXEC(puSrc1->au8[13], puSrc2->au8[13]);
12250 puDst->au8[14] = PAVGB_EXEC(puSrc1->au8[14], puSrc2->au8[14]);
12251 puDst->au8[15] = PAVGB_EXEC(puSrc1->au8[15], puSrc2->au8[15]);
12252}
12253
12254
12255IEM_DECL_IMPL_DEF(void, iemAImpl_vpavgb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12256{
12257 puDst->au8[ 0] = PAVGB_EXEC(puSrc1->au8[ 0], puSrc2->au8[ 0]);
12258 puDst->au8[ 1] = PAVGB_EXEC(puSrc1->au8[ 1], puSrc2->au8[ 1]);
12259 puDst->au8[ 2] = PAVGB_EXEC(puSrc1->au8[ 2], puSrc2->au8[ 2]);
12260 puDst->au8[ 3] = PAVGB_EXEC(puSrc1->au8[ 3], puSrc2->au8[ 3]);
12261 puDst->au8[ 4] = PAVGB_EXEC(puSrc1->au8[ 4], puSrc2->au8[ 4]);
12262 puDst->au8[ 5] = PAVGB_EXEC(puSrc1->au8[ 5], puSrc2->au8[ 5]);
12263 puDst->au8[ 6] = PAVGB_EXEC(puSrc1->au8[ 6], puSrc2->au8[ 6]);
12264 puDst->au8[ 7] = PAVGB_EXEC(puSrc1->au8[ 7], puSrc2->au8[ 7]);
12265 puDst->au8[ 8] = PAVGB_EXEC(puSrc1->au8[ 8], puSrc2->au8[ 8]);
12266 puDst->au8[ 9] = PAVGB_EXEC(puSrc1->au8[ 9], puSrc2->au8[ 9]);
12267 puDst->au8[10] = PAVGB_EXEC(puSrc1->au8[10], puSrc2->au8[10]);
12268 puDst->au8[11] = PAVGB_EXEC(puSrc1->au8[11], puSrc2->au8[11]);
12269 puDst->au8[12] = PAVGB_EXEC(puSrc1->au8[12], puSrc2->au8[12]);
12270 puDst->au8[13] = PAVGB_EXEC(puSrc1->au8[13], puSrc2->au8[13]);
12271 puDst->au8[14] = PAVGB_EXEC(puSrc1->au8[14], puSrc2->au8[14]);
12272 puDst->au8[15] = PAVGB_EXEC(puSrc1->au8[15], puSrc2->au8[15]);
12273 puDst->au8[16] = PAVGB_EXEC(puSrc1->au8[16], puSrc2->au8[16]);
12274 puDst->au8[17] = PAVGB_EXEC(puSrc1->au8[17], puSrc2->au8[17]);
12275 puDst->au8[18] = PAVGB_EXEC(puSrc1->au8[18], puSrc2->au8[18]);
12276 puDst->au8[19] = PAVGB_EXEC(puSrc1->au8[19], puSrc2->au8[19]);
12277 puDst->au8[20] = PAVGB_EXEC(puSrc1->au8[20], puSrc2->au8[20]);
12278 puDst->au8[21] = PAVGB_EXEC(puSrc1->au8[21], puSrc2->au8[21]);
12279 puDst->au8[22] = PAVGB_EXEC(puSrc1->au8[22], puSrc2->au8[22]);
12280 puDst->au8[23] = PAVGB_EXEC(puSrc1->au8[23], puSrc2->au8[23]);
12281 puDst->au8[24] = PAVGB_EXEC(puSrc1->au8[24], puSrc2->au8[24]);
12282 puDst->au8[25] = PAVGB_EXEC(puSrc1->au8[25], puSrc2->au8[25]);
12283 puDst->au8[26] = PAVGB_EXEC(puSrc1->au8[26], puSrc2->au8[26]);
12284 puDst->au8[27] = PAVGB_EXEC(puSrc1->au8[27], puSrc2->au8[27]);
12285 puDst->au8[28] = PAVGB_EXEC(puSrc1->au8[28], puSrc2->au8[28]);
12286 puDst->au8[29] = PAVGB_EXEC(puSrc1->au8[29], puSrc2->au8[29]);
12287 puDst->au8[30] = PAVGB_EXEC(puSrc1->au8[30], puSrc2->au8[30]);
12288 puDst->au8[31] = PAVGB_EXEC(puSrc1->au8[31], puSrc2->au8[31]);
12289}
12290
12291
12292IEM_DECL_IMPL_DEF(void, iemAImpl_vpavgw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12293{
12294 puDst->au16[ 0] = PAVGW_EXEC(puSrc1->au16[ 0], puSrc2->au16[ 0]);
12295 puDst->au16[ 1] = PAVGW_EXEC(puSrc1->au16[ 1], puSrc2->au16[ 1]);
12296 puDst->au16[ 2] = PAVGW_EXEC(puSrc1->au16[ 2], puSrc2->au16[ 2]);
12297 puDst->au16[ 3] = PAVGW_EXEC(puSrc1->au16[ 3], puSrc2->au16[ 3]);
12298 puDst->au16[ 4] = PAVGW_EXEC(puSrc1->au16[ 4], puSrc2->au16[ 4]);
12299 puDst->au16[ 5] = PAVGW_EXEC(puSrc1->au16[ 5], puSrc2->au16[ 5]);
12300 puDst->au16[ 6] = PAVGW_EXEC(puSrc1->au16[ 6], puSrc2->au16[ 6]);
12301 puDst->au16[ 7] = PAVGW_EXEC(puSrc1->au16[ 7], puSrc2->au16[ 7]);
12302}
12303
12304
12305IEM_DECL_IMPL_DEF(void, iemAImpl_vpavgw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12306{
12307 puDst->au16[ 0] = PAVGW_EXEC(puSrc1->au16[ 0], puSrc2->au16[ 0]);
12308 puDst->au16[ 1] = PAVGW_EXEC(puSrc1->au16[ 1], puSrc2->au16[ 1]);
12309 puDst->au16[ 2] = PAVGW_EXEC(puSrc1->au16[ 2], puSrc2->au16[ 2]);
12310 puDst->au16[ 3] = PAVGW_EXEC(puSrc1->au16[ 3], puSrc2->au16[ 3]);
12311 puDst->au16[ 4] = PAVGW_EXEC(puSrc1->au16[ 4], puSrc2->au16[ 4]);
12312 puDst->au16[ 5] = PAVGW_EXEC(puSrc1->au16[ 5], puSrc2->au16[ 5]);
12313 puDst->au16[ 6] = PAVGW_EXEC(puSrc1->au16[ 6], puSrc2->au16[ 6]);
12314 puDst->au16[ 7] = PAVGW_EXEC(puSrc1->au16[ 7], puSrc2->au16[ 7]);
12315 puDst->au16[ 8] = PAVGW_EXEC(puSrc1->au16[ 8], puSrc2->au16[ 8]);
12316 puDst->au16[ 9] = PAVGW_EXEC(puSrc1->au16[ 9], puSrc2->au16[ 9]);
12317 puDst->au16[10] = PAVGW_EXEC(puSrc1->au16[10], puSrc2->au16[10]);
12318 puDst->au16[11] = PAVGW_EXEC(puSrc1->au16[11], puSrc2->au16[11]);
12319 puDst->au16[12] = PAVGW_EXEC(puSrc1->au16[12], puSrc2->au16[12]);
12320 puDst->au16[13] = PAVGW_EXEC(puSrc1->au16[13], puSrc2->au16[13]);
12321 puDst->au16[14] = PAVGW_EXEC(puSrc1->au16[14], puSrc2->au16[14]);
12322 puDst->au16[15] = PAVGW_EXEC(puSrc1->au16[15], puSrc2->au16[15]);
12323}
12324
12325#undef PAVGB_EXEC
12326#undef PAVGW_EXEC
12327
12328
12329/*
12330 * PMOVMSKB / VPMOVMSKB
12331 */
12332#ifdef IEM_WITHOUT_ASSEMBLY
12333
12334IEM_DECL_IMPL_DEF(void, iemAImpl_pmovmskb_u64,(uint64_t *pu64Dst, uint64_t const *pu64Src))
12335{
12336 /* The the most signficant bit from each byte and store them in the given general purpose register. */
12337 uint64_t const uSrc = *pu64Src;
12338 *pu64Dst = ((uSrc >> ( 7-0)) & RT_BIT_64(0))
12339 | ((uSrc >> (15-1)) & RT_BIT_64(1))
12340 | ((uSrc >> (23-2)) & RT_BIT_64(2))
12341 | ((uSrc >> (31-3)) & RT_BIT_64(3))
12342 | ((uSrc >> (39-4)) & RT_BIT_64(4))
12343 | ((uSrc >> (47-5)) & RT_BIT_64(5))
12344 | ((uSrc >> (55-6)) & RT_BIT_64(6))
12345 | ((uSrc >> (63-7)) & RT_BIT_64(7));
12346}
12347
12348
12349IEM_DECL_IMPL_DEF(void, iemAImpl_pmovmskb_u128,(uint64_t *pu64Dst, PCRTUINT128U pu128Src))
12350{
12351 /* The the most signficant bit from each byte and store them in the given general purpose register. */
12352 uint64_t const uSrc0 = pu128Src->QWords.qw0;
12353 uint64_t const uSrc1 = pu128Src->QWords.qw1;
12354 *pu64Dst = ((uSrc0 >> ( 7-0)) & RT_BIT_64(0))
12355 | ((uSrc0 >> (15-1)) & RT_BIT_64(1))
12356 | ((uSrc0 >> (23-2)) & RT_BIT_64(2))
12357 | ((uSrc0 >> (31-3)) & RT_BIT_64(3))
12358 | ((uSrc0 >> (39-4)) & RT_BIT_64(4))
12359 | ((uSrc0 >> (47-5)) & RT_BIT_64(5))
12360 | ((uSrc0 >> (55-6)) & RT_BIT_64(6))
12361 | ((uSrc0 >> (63-7)) & RT_BIT_64(7))
12362 | ((uSrc1 << (1 /*7-8*/)) & RT_BIT_64(8))
12363 | ((uSrc1 >> (15-9)) & RT_BIT_64(9))
12364 | ((uSrc1 >> (23-10)) & RT_BIT_64(10))
12365 | ((uSrc1 >> (31-11)) & RT_BIT_64(11))
12366 | ((uSrc1 >> (39-12)) & RT_BIT_64(12))
12367 | ((uSrc1 >> (47-13)) & RT_BIT_64(13))
12368 | ((uSrc1 >> (55-14)) & RT_BIT_64(14))
12369 | ((uSrc1 >> (63-15)) & RT_BIT_64(15));
12370}
12371
12372#endif
12373
12374IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovmskb_u256_fallback,(uint64_t *pu64Dst, PCRTUINT256U puSrc))
12375{
12376 /* The the most signficant bit from each byte and store them in the given general purpose register. */
12377 uint64_t const uSrc0 = puSrc->QWords.qw0;
12378 uint64_t const uSrc1 = puSrc->QWords.qw1;
12379 uint64_t const uSrc2 = puSrc->QWords.qw2;
12380 uint64_t const uSrc3 = puSrc->QWords.qw3;
12381 *pu64Dst = ((uSrc0 >> ( 7-0)) & RT_BIT_64(0))
12382 | ((uSrc0 >> (15-1)) & RT_BIT_64(1))
12383 | ((uSrc0 >> (23-2)) & RT_BIT_64(2))
12384 | ((uSrc0 >> (31-3)) & RT_BIT_64(3))
12385 | ((uSrc0 >> (39-4)) & RT_BIT_64(4))
12386 | ((uSrc0 >> (47-5)) & RT_BIT_64(5))
12387 | ((uSrc0 >> (55-6)) & RT_BIT_64(6))
12388 | ((uSrc0 >> (63-7)) & RT_BIT_64(7))
12389 | ((uSrc1 << (1 /*7-8*/)) & RT_BIT_64(8))
12390 | ((uSrc1 >> (15-9)) & RT_BIT_64(9))
12391 | ((uSrc1 >> (23-10)) & RT_BIT_64(10))
12392 | ((uSrc1 >> (31-11)) & RT_BIT_64(11))
12393 | ((uSrc1 >> (39-12)) & RT_BIT_64(12))
12394 | ((uSrc1 >> (47-13)) & RT_BIT_64(13))
12395 | ((uSrc1 >> (55-14)) & RT_BIT_64(14))
12396 | ((uSrc1 >> (63-15)) & RT_BIT_64(15))
12397 | ((uSrc2 << (9 /* 7-16*/)) & RT_BIT_64(16))
12398 | ((uSrc2 << (2 /*15-17*/)) & RT_BIT_64(17))
12399 | ((uSrc2 >> (23-18)) & RT_BIT_64(18))
12400 | ((uSrc2 >> (31-19)) & RT_BIT_64(19))
12401 | ((uSrc2 >> (39-20)) & RT_BIT_64(20))
12402 | ((uSrc2 >> (47-21)) & RT_BIT_64(21))
12403 | ((uSrc2 >> (55-22)) & RT_BIT_64(22))
12404 | ((uSrc2 >> (63-23)) & RT_BIT_64(23))
12405 | ((uSrc3 << (17 /* 7-24*/)) & RT_BIT_64(24))
12406 | ((uSrc3 << (10 /*15-25*/)) & RT_BIT_64(25))
12407 | ((uSrc3 << (3 /*23-26*/)) & RT_BIT_64(26))
12408 | ((uSrc3 >> (31-27)) & RT_BIT_64(27))
12409 | ((uSrc3 >> (39-28)) & RT_BIT_64(28))
12410 | ((uSrc3 >> (47-29)) & RT_BIT_64(29))
12411 | ((uSrc3 >> (55-30)) & RT_BIT_64(30))
12412 | ((uSrc3 >> (63-31)) & RT_BIT_64(31));
12413}
12414
12415
12416/*
12417 * [V]PSHUFB
12418 */
12419
12420IEM_DECL_IMPL_DEF(void, iemAImpl_pshufb_u64_fallback,(uint64_t *puDst, uint64_t const *puSrc))
12421{
12422 RTUINT64U const uSrc = { *puSrc };
12423 RTUINT64U const uDstIn = { *puDst };
12424 ASMCompilerBarrier();
12425 RTUINT64U uDstOut = { 0 };
12426 for (unsigned iByte = 0; iByte < RT_ELEMENTS(uDstIn.au8); iByte++)
12427 {
12428 uint8_t idxSrc = uSrc.au8[iByte];
12429 if (!(idxSrc & 0x80))
12430 uDstOut.au8[iByte] = uDstIn.au8[idxSrc & 7];
12431 }
12432 *puDst = uDstOut.u;
12433}
12434
12435
12436IEM_DECL_IMPL_DEF(void, iemAImpl_pshufb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12437{
12438 RTUINT128U const uSrc = *puSrc;
12439 RTUINT128U const uDstIn = *puDst;
12440 ASMCompilerBarrier();
12441 puDst->au64[0] = 0;
12442 puDst->au64[1] = 0;
12443 for (unsigned iByte = 0; iByte < RT_ELEMENTS(puDst->au8); iByte++)
12444 {
12445 uint8_t idxSrc = uSrc.au8[iByte];
12446 if (!(idxSrc & 0x80))
12447 puDst->au8[iByte] = uDstIn.au8[idxSrc & 15];
12448 }
12449}
12450
12451
12452IEM_DECL_IMPL_DEF(void, iemAImpl_vpshufb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12453{
12454 RTUINT128U const uSrc1 = *puSrc1; /* could be same as puDst */
12455 RTUINT128U const uSrc2 = *puSrc2; /* could be same as puDst */
12456 ASMCompilerBarrier();
12457 puDst->au64[0] = 0;
12458 puDst->au64[1] = 0;
12459 for (unsigned iByte = 0; iByte < 16; iByte++)
12460 {
12461 uint8_t idxSrc = uSrc2.au8[iByte];
12462 if (!(idxSrc & 0x80))
12463 puDst->au8[iByte] = uSrc1.au8[(idxSrc & 15)];
12464 }
12465}
12466
12467
12468IEM_DECL_IMPL_DEF(void, iemAImpl_vpshufb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12469{
12470 RTUINT256U const uSrc1 = *puSrc1; /* could be same as puDst */
12471 RTUINT256U const uSrc2 = *puSrc2; /* could be same as puDst */
12472 ASMCompilerBarrier();
12473 puDst->au64[0] = 0;
12474 puDst->au64[1] = 0;
12475 puDst->au64[2] = 0;
12476 puDst->au64[3] = 0;
12477 for (unsigned iByte = 0; iByte < 16; iByte++)
12478 {
12479 uint8_t idxSrc = uSrc2.au8[iByte];
12480 if (!(idxSrc & 0x80))
12481 puDst->au8[iByte] = uSrc1.au8[(idxSrc & 15)];
12482 }
12483 for (unsigned iByte = 16; iByte < RT_ELEMENTS(puDst->au8); iByte++)
12484 {
12485 uint8_t idxSrc = uSrc2.au8[iByte];
12486 if (!(idxSrc & 0x80))
12487 puDst->au8[iByte] = uSrc1.au8[(idxSrc & 15) + 16]; /* baka intel */
12488 }
12489}
12490
12491
12492/*
12493 * PSHUFW, [V]PSHUFHW, [V]PSHUFLW, [V]PSHUFD
12494 */
12495#ifdef IEM_WITHOUT_ASSEMBLY
12496
12497IEM_DECL_IMPL_DEF(void, iemAImpl_pshufw_u64,(uint64_t *puDst, uint64_t const *puSrc, uint8_t bEvil))
12498{
12499 uint64_t const uSrc = *puSrc;
12500 ASMCompilerBarrier();
12501 *puDst = RT_MAKE_U64_FROM_U16(uSrc >> (( bEvil & 3) * 16),
12502 uSrc >> (((bEvil >> 2) & 3) * 16),
12503 uSrc >> (((bEvil >> 4) & 3) * 16),
12504 uSrc >> (((bEvil >> 6) & 3) * 16));
12505}
12506
12507
12508IEM_DECL_IMPL_DEF(void, iemAImpl_pshufhw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
12509{
12510 puDst->QWords.qw0 = puSrc->QWords.qw0;
12511 uint64_t const uSrc = puSrc->QWords.qw1;
12512 ASMCompilerBarrier();
12513 puDst->QWords.qw1 = RT_MAKE_U64_FROM_U16(uSrc >> (( bEvil & 3) * 16),
12514 uSrc >> (((bEvil >> 2) & 3) * 16),
12515 uSrc >> (((bEvil >> 4) & 3) * 16),
12516 uSrc >> (((bEvil >> 6) & 3) * 16));
12517}
12518
12519#endif
12520
12521IEM_DECL_IMPL_DEF(void, iemAImpl_vpshufhw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc, uint8_t bEvil))
12522{
12523 puDst->QWords.qw0 = puSrc->QWords.qw0;
12524 uint64_t const uSrc1 = puSrc->QWords.qw1;
12525 puDst->QWords.qw2 = puSrc->QWords.qw2;
12526 uint64_t const uSrc3 = puSrc->QWords.qw3;
12527 ASMCompilerBarrier();
12528 puDst->QWords.qw1 = RT_MAKE_U64_FROM_U16(uSrc1 >> (( bEvil & 3) * 16),
12529 uSrc1 >> (((bEvil >> 2) & 3) * 16),
12530 uSrc1 >> (((bEvil >> 4) & 3) * 16),
12531 uSrc1 >> (((bEvil >> 6) & 3) * 16));
12532 puDst->QWords.qw3 = RT_MAKE_U64_FROM_U16(uSrc3 >> (( bEvil & 3) * 16),
12533 uSrc3 >> (((bEvil >> 2) & 3) * 16),
12534 uSrc3 >> (((bEvil >> 4) & 3) * 16),
12535 uSrc3 >> (((bEvil >> 6) & 3) * 16));
12536}
12537
12538#ifdef IEM_WITHOUT_ASSEMBLY
12539IEM_DECL_IMPL_DEF(void, iemAImpl_pshuflw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
12540{
12541 puDst->QWords.qw1 = puSrc->QWords.qw1;
12542 uint64_t const uSrc = puSrc->QWords.qw0;
12543 ASMCompilerBarrier();
12544 puDst->QWords.qw0 = RT_MAKE_U64_FROM_U16(uSrc >> (( bEvil & 3) * 16),
12545 uSrc >> (((bEvil >> 2) & 3) * 16),
12546 uSrc >> (((bEvil >> 4) & 3) * 16),
12547 uSrc >> (((bEvil >> 6) & 3) * 16));
12548
12549}
12550#endif
12551
12552
12553IEM_DECL_IMPL_DEF(void, iemAImpl_vpshuflw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc, uint8_t bEvil))
12554{
12555 puDst->QWords.qw3 = puSrc->QWords.qw3;
12556 uint64_t const uSrc2 = puSrc->QWords.qw2;
12557 puDst->QWords.qw1 = puSrc->QWords.qw1;
12558 uint64_t const uSrc0 = puSrc->QWords.qw0;
12559 ASMCompilerBarrier();
12560 puDst->QWords.qw0 = RT_MAKE_U64_FROM_U16(uSrc0 >> (( bEvil & 3) * 16),
12561 uSrc0 >> (((bEvil >> 2) & 3) * 16),
12562 uSrc0 >> (((bEvil >> 4) & 3) * 16),
12563 uSrc0 >> (((bEvil >> 6) & 3) * 16));
12564 puDst->QWords.qw2 = RT_MAKE_U64_FROM_U16(uSrc2 >> (( bEvil & 3) * 16),
12565 uSrc2 >> (((bEvil >> 2) & 3) * 16),
12566 uSrc2 >> (((bEvil >> 4) & 3) * 16),
12567 uSrc2 >> (((bEvil >> 6) & 3) * 16));
12568
12569}
12570
12571
12572#ifdef IEM_WITHOUT_ASSEMBLY
12573IEM_DECL_IMPL_DEF(void, iemAImpl_pshufd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
12574{
12575 RTUINT128U const uSrc = *puSrc;
12576 ASMCompilerBarrier();
12577 puDst->au32[0] = uSrc.au32[bEvil & 3];
12578 puDst->au32[1] = uSrc.au32[(bEvil >> 2) & 3];
12579 puDst->au32[2] = uSrc.au32[(bEvil >> 4) & 3];
12580 puDst->au32[3] = uSrc.au32[(bEvil >> 6) & 3];
12581}
12582#endif
12583
12584
12585IEM_DECL_IMPL_DEF(void, iemAImpl_vpshufd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc, uint8_t bEvil))
12586{
12587 RTUINT256U const uSrc = *puSrc;
12588 ASMCompilerBarrier();
12589 puDst->au128[0].au32[0] = uSrc.au128[0].au32[bEvil & 3];
12590 puDst->au128[0].au32[1] = uSrc.au128[0].au32[(bEvil >> 2) & 3];
12591 puDst->au128[0].au32[2] = uSrc.au128[0].au32[(bEvil >> 4) & 3];
12592 puDst->au128[0].au32[3] = uSrc.au128[0].au32[(bEvil >> 6) & 3];
12593 puDst->au128[1].au32[0] = uSrc.au128[1].au32[bEvil & 3];
12594 puDst->au128[1].au32[1] = uSrc.au128[1].au32[(bEvil >> 2) & 3];
12595 puDst->au128[1].au32[2] = uSrc.au128[1].au32[(bEvil >> 4) & 3];
12596 puDst->au128[1].au32[3] = uSrc.au128[1].au32[(bEvil >> 6) & 3];
12597}
12598
12599
12600/*
12601 * PUNPCKHBW - high bytes -> words
12602 */
12603#ifdef IEM_WITHOUT_ASSEMBLY
12604
12605IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhbw_u64,(uint64_t *puDst, uint64_t const *puSrc))
12606{
12607 RTUINT64U const uSrc2 = { *puSrc };
12608 RTUINT64U const uSrc1 = { *puDst };
12609 ASMCompilerBarrier();
12610 RTUINT64U uDstOut;
12611 uDstOut.au8[0] = uSrc1.au8[4];
12612 uDstOut.au8[1] = uSrc2.au8[4];
12613 uDstOut.au8[2] = uSrc1.au8[5];
12614 uDstOut.au8[3] = uSrc2.au8[5];
12615 uDstOut.au8[4] = uSrc1.au8[6];
12616 uDstOut.au8[5] = uSrc2.au8[6];
12617 uDstOut.au8[6] = uSrc1.au8[7];
12618 uDstOut.au8[7] = uSrc2.au8[7];
12619 *puDst = uDstOut.u;
12620}
12621
12622
12623IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhbw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12624{
12625 RTUINT128U const uSrc2 = *puSrc;
12626 RTUINT128U const uSrc1 = *puDst;
12627 ASMCompilerBarrier();
12628 RTUINT128U uDstOut;
12629 uDstOut.au8[ 0] = uSrc1.au8[ 8];
12630 uDstOut.au8[ 1] = uSrc2.au8[ 8];
12631 uDstOut.au8[ 2] = uSrc1.au8[ 9];
12632 uDstOut.au8[ 3] = uSrc2.au8[ 9];
12633 uDstOut.au8[ 4] = uSrc1.au8[10];
12634 uDstOut.au8[ 5] = uSrc2.au8[10];
12635 uDstOut.au8[ 6] = uSrc1.au8[11];
12636 uDstOut.au8[ 7] = uSrc2.au8[11];
12637 uDstOut.au8[ 8] = uSrc1.au8[12];
12638 uDstOut.au8[ 9] = uSrc2.au8[12];
12639 uDstOut.au8[10] = uSrc1.au8[13];
12640 uDstOut.au8[11] = uSrc2.au8[13];
12641 uDstOut.au8[12] = uSrc1.au8[14];
12642 uDstOut.au8[13] = uSrc2.au8[14];
12643 uDstOut.au8[14] = uSrc1.au8[15];
12644 uDstOut.au8[15] = uSrc2.au8[15];
12645 *puDst = uDstOut;
12646}
12647
12648#endif
12649
12650IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhbw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12651{
12652 RTUINT128U const uSrc2 = *puSrc2;
12653 RTUINT128U const uSrc1 = *puSrc1;
12654 ASMCompilerBarrier();
12655 RTUINT128U uDstOut;
12656 uDstOut.au8[ 0] = uSrc1.au8[ 8];
12657 uDstOut.au8[ 1] = uSrc2.au8[ 8];
12658 uDstOut.au8[ 2] = uSrc1.au8[ 9];
12659 uDstOut.au8[ 3] = uSrc2.au8[ 9];
12660 uDstOut.au8[ 4] = uSrc1.au8[10];
12661 uDstOut.au8[ 5] = uSrc2.au8[10];
12662 uDstOut.au8[ 6] = uSrc1.au8[11];
12663 uDstOut.au8[ 7] = uSrc2.au8[11];
12664 uDstOut.au8[ 8] = uSrc1.au8[12];
12665 uDstOut.au8[ 9] = uSrc2.au8[12];
12666 uDstOut.au8[10] = uSrc1.au8[13];
12667 uDstOut.au8[11] = uSrc2.au8[13];
12668 uDstOut.au8[12] = uSrc1.au8[14];
12669 uDstOut.au8[13] = uSrc2.au8[14];
12670 uDstOut.au8[14] = uSrc1.au8[15];
12671 uDstOut.au8[15] = uSrc2.au8[15];
12672 *puDst = uDstOut;
12673}
12674
12675
12676IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhbw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12677{
12678 RTUINT256U const uSrc2 = *puSrc2;
12679 RTUINT256U const uSrc1 = *puSrc1;
12680 ASMCompilerBarrier();
12681 RTUINT256U uDstOut;
12682 uDstOut.au8[ 0] = uSrc1.au8[ 8];
12683 uDstOut.au8[ 1] = uSrc2.au8[ 8];
12684 uDstOut.au8[ 2] = uSrc1.au8[ 9];
12685 uDstOut.au8[ 3] = uSrc2.au8[ 9];
12686 uDstOut.au8[ 4] = uSrc1.au8[10];
12687 uDstOut.au8[ 5] = uSrc2.au8[10];
12688 uDstOut.au8[ 6] = uSrc1.au8[11];
12689 uDstOut.au8[ 7] = uSrc2.au8[11];
12690 uDstOut.au8[ 8] = uSrc1.au8[12];
12691 uDstOut.au8[ 9] = uSrc2.au8[12];
12692 uDstOut.au8[10] = uSrc1.au8[13];
12693 uDstOut.au8[11] = uSrc2.au8[13];
12694 uDstOut.au8[12] = uSrc1.au8[14];
12695 uDstOut.au8[13] = uSrc2.au8[14];
12696 uDstOut.au8[14] = uSrc1.au8[15];
12697 uDstOut.au8[15] = uSrc2.au8[15];
12698 /* As usual, the upper 128-bits are treated like a parallel register to the lower half. */
12699 uDstOut.au8[16] = uSrc1.au8[24];
12700 uDstOut.au8[17] = uSrc2.au8[24];
12701 uDstOut.au8[18] = uSrc1.au8[25];
12702 uDstOut.au8[19] = uSrc2.au8[25];
12703 uDstOut.au8[20] = uSrc1.au8[26];
12704 uDstOut.au8[21] = uSrc2.au8[26];
12705 uDstOut.au8[22] = uSrc1.au8[27];
12706 uDstOut.au8[23] = uSrc2.au8[27];
12707 uDstOut.au8[24] = uSrc1.au8[28];
12708 uDstOut.au8[25] = uSrc2.au8[28];
12709 uDstOut.au8[26] = uSrc1.au8[29];
12710 uDstOut.au8[27] = uSrc2.au8[29];
12711 uDstOut.au8[28] = uSrc1.au8[30];
12712 uDstOut.au8[29] = uSrc2.au8[30];
12713 uDstOut.au8[30] = uSrc1.au8[31];
12714 uDstOut.au8[31] = uSrc2.au8[31];
12715 *puDst = uDstOut;
12716}
12717
12718
12719/*
12720 * PUNPCKHWD - high words -> dwords
12721 */
12722#ifdef IEM_WITHOUT_ASSEMBLY
12723
12724IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhwd_u64,(uint64_t *puDst, uint64_t const *puSrc))
12725{
12726 RTUINT64U const uSrc2 = { *puSrc };
12727 RTUINT64U const uSrc1 = { *puDst };
12728 ASMCompilerBarrier();
12729 RTUINT64U uDstOut;
12730 uDstOut.au16[0] = uSrc1.au16[2];
12731 uDstOut.au16[1] = uSrc2.au16[2];
12732 uDstOut.au16[2] = uSrc1.au16[3];
12733 uDstOut.au16[3] = uSrc2.au16[3];
12734 *puDst = uDstOut.u;
12735}
12736
12737
12738IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhwd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12739{
12740 RTUINT128U const uSrc2 = *puSrc;
12741 RTUINT128U const uSrc1 = *puDst;
12742 ASMCompilerBarrier();
12743 RTUINT128U uDstOut;
12744 uDstOut.au16[0] = uSrc1.au16[4];
12745 uDstOut.au16[1] = uSrc2.au16[4];
12746 uDstOut.au16[2] = uSrc1.au16[5];
12747 uDstOut.au16[3] = uSrc2.au16[5];
12748 uDstOut.au16[4] = uSrc1.au16[6];
12749 uDstOut.au16[5] = uSrc2.au16[6];
12750 uDstOut.au16[6] = uSrc1.au16[7];
12751 uDstOut.au16[7] = uSrc2.au16[7];
12752 *puDst = uDstOut;
12753}
12754
12755#endif
12756
12757IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhwd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12758{
12759 RTUINT128U const uSrc2 = *puSrc2;
12760 RTUINT128U const uSrc1 = *puSrc1;
12761 ASMCompilerBarrier();
12762 RTUINT128U uDstOut;
12763 uDstOut.au16[0] = uSrc1.au16[4];
12764 uDstOut.au16[1] = uSrc2.au16[4];
12765 uDstOut.au16[2] = uSrc1.au16[5];
12766 uDstOut.au16[3] = uSrc2.au16[5];
12767 uDstOut.au16[4] = uSrc1.au16[6];
12768 uDstOut.au16[5] = uSrc2.au16[6];
12769 uDstOut.au16[6] = uSrc1.au16[7];
12770 uDstOut.au16[7] = uSrc2.au16[7];
12771 *puDst = uDstOut;
12772}
12773
12774
12775IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhwd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12776{
12777 RTUINT256U const uSrc2 = *puSrc2;
12778 RTUINT256U const uSrc1 = *puSrc1;
12779 ASMCompilerBarrier();
12780 RTUINT256U uDstOut;
12781 uDstOut.au16[0] = uSrc1.au16[4];
12782 uDstOut.au16[1] = uSrc2.au16[4];
12783 uDstOut.au16[2] = uSrc1.au16[5];
12784 uDstOut.au16[3] = uSrc2.au16[5];
12785 uDstOut.au16[4] = uSrc1.au16[6];
12786 uDstOut.au16[5] = uSrc2.au16[6];
12787 uDstOut.au16[6] = uSrc1.au16[7];
12788 uDstOut.au16[7] = uSrc2.au16[7];
12789
12790 uDstOut.au16[8] = uSrc1.au16[12];
12791 uDstOut.au16[9] = uSrc2.au16[12];
12792 uDstOut.au16[10] = uSrc1.au16[13];
12793 uDstOut.au16[11] = uSrc2.au16[13];
12794 uDstOut.au16[12] = uSrc1.au16[14];
12795 uDstOut.au16[13] = uSrc2.au16[14];
12796 uDstOut.au16[14] = uSrc1.au16[15];
12797 uDstOut.au16[15] = uSrc2.au16[15];
12798 *puDst = uDstOut;
12799}
12800
12801
12802/*
12803 * PUNPCKHDQ - high dwords -> qword(s)
12804 */
12805#ifdef IEM_WITHOUT_ASSEMBLY
12806
12807IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhdq_u64,(uint64_t *puDst, uint64_t const *puSrc))
12808{
12809 RTUINT64U const uSrc2 = { *puSrc };
12810 RTUINT64U const uSrc1 = { *puDst };
12811 ASMCompilerBarrier();
12812 RTUINT64U uDstOut;
12813 uDstOut.au32[0] = uSrc1.au32[1];
12814 uDstOut.au32[1] = uSrc2.au32[1];
12815 *puDst = uDstOut.u;
12816}
12817
12818
12819IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhdq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12820{
12821 RTUINT128U const uSrc2 = *puSrc;
12822 RTUINT128U const uSrc1 = *puDst;
12823 ASMCompilerBarrier();
12824 RTUINT128U uDstOut;
12825 uDstOut.au32[0] = uSrc1.au32[2];
12826 uDstOut.au32[1] = uSrc2.au32[2];
12827 uDstOut.au32[2] = uSrc1.au32[3];
12828 uDstOut.au32[3] = uSrc2.au32[3];
12829 *puDst = uDstOut;
12830}
12831
12832#endif
12833
12834IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhdq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12835{
12836 RTUINT128U const uSrc2 = *puSrc2;
12837 RTUINT128U const uSrc1 = *puSrc1;
12838 ASMCompilerBarrier();
12839 RTUINT128U uDstOut;
12840 uDstOut.au32[0] = uSrc1.au32[2];
12841 uDstOut.au32[1] = uSrc2.au32[2];
12842 uDstOut.au32[2] = uSrc1.au32[3];
12843 uDstOut.au32[3] = uSrc2.au32[3];
12844 *puDst = uDstOut;
12845}
12846
12847
12848IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhdq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12849{
12850 RTUINT256U const uSrc2 = *puSrc2;
12851 RTUINT256U const uSrc1 = *puSrc1;
12852 ASMCompilerBarrier();
12853 RTUINT256U uDstOut;
12854 uDstOut.au32[0] = uSrc1.au32[2];
12855 uDstOut.au32[1] = uSrc2.au32[2];
12856 uDstOut.au32[2] = uSrc1.au32[3];
12857 uDstOut.au32[3] = uSrc2.au32[3];
12858
12859 uDstOut.au32[4] = uSrc1.au32[6];
12860 uDstOut.au32[5] = uSrc2.au32[6];
12861 uDstOut.au32[6] = uSrc1.au32[7];
12862 uDstOut.au32[7] = uSrc2.au32[7];
12863 *puDst = uDstOut;
12864}
12865
12866
12867/*
12868 * PUNPCKHQDQ -> High qwords -> double qword(s).
12869 */
12870#ifdef IEM_WITHOUT_ASSEMBLY
12871IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhqdq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12872{
12873 RTUINT128U const uSrc2 = *puSrc;
12874 RTUINT128U const uSrc1 = *puDst;
12875 ASMCompilerBarrier();
12876 RTUINT128U uDstOut;
12877 uDstOut.au64[0] = uSrc1.au64[1];
12878 uDstOut.au64[1] = uSrc2.au64[1];
12879 *puDst = uDstOut;
12880}
12881#endif
12882
12883
12884IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhqdq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12885{
12886 RTUINT128U const uSrc2 = *puSrc2;
12887 RTUINT128U const uSrc1 = *puSrc1;
12888 ASMCompilerBarrier();
12889 RTUINT128U uDstOut;
12890 uDstOut.au64[0] = uSrc1.au64[1];
12891 uDstOut.au64[1] = uSrc2.au64[1];
12892 *puDst = uDstOut;
12893}
12894
12895
12896IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhqdq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12897{
12898 RTUINT256U const uSrc2 = *puSrc2;
12899 RTUINT256U const uSrc1 = *puSrc1;
12900 ASMCompilerBarrier();
12901 RTUINT256U uDstOut;
12902 uDstOut.au64[0] = uSrc1.au64[1];
12903 uDstOut.au64[1] = uSrc2.au64[1];
12904
12905 uDstOut.au64[2] = uSrc1.au64[3];
12906 uDstOut.au64[3] = uSrc2.au64[3];
12907 *puDst = uDstOut;
12908}
12909
12910
12911/*
12912 * PUNPCKLBW - low bytes -> words
12913 */
12914#ifdef IEM_WITHOUT_ASSEMBLY
12915
12916IEM_DECL_IMPL_DEF(void, iemAImpl_punpcklbw_u64,(uint64_t *puDst, uint64_t const *puSrc))
12917{
12918 RTUINT64U const uSrc2 = { *puSrc };
12919 RTUINT64U const uSrc1 = { *puDst };
12920 ASMCompilerBarrier();
12921 RTUINT64U uDstOut;
12922 uDstOut.au8[0] = uSrc1.au8[0];
12923 uDstOut.au8[1] = uSrc2.au8[0];
12924 uDstOut.au8[2] = uSrc1.au8[1];
12925 uDstOut.au8[3] = uSrc2.au8[1];
12926 uDstOut.au8[4] = uSrc1.au8[2];
12927 uDstOut.au8[5] = uSrc2.au8[2];
12928 uDstOut.au8[6] = uSrc1.au8[3];
12929 uDstOut.au8[7] = uSrc2.au8[3];
12930 *puDst = uDstOut.u;
12931}
12932
12933
12934IEM_DECL_IMPL_DEF(void, iemAImpl_punpcklbw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12935{
12936 RTUINT128U const uSrc2 = *puSrc;
12937 RTUINT128U const uSrc1 = *puDst;
12938 ASMCompilerBarrier();
12939 RTUINT128U uDstOut;
12940 uDstOut.au8[ 0] = uSrc1.au8[0];
12941 uDstOut.au8[ 1] = uSrc2.au8[0];
12942 uDstOut.au8[ 2] = uSrc1.au8[1];
12943 uDstOut.au8[ 3] = uSrc2.au8[1];
12944 uDstOut.au8[ 4] = uSrc1.au8[2];
12945 uDstOut.au8[ 5] = uSrc2.au8[2];
12946 uDstOut.au8[ 6] = uSrc1.au8[3];
12947 uDstOut.au8[ 7] = uSrc2.au8[3];
12948 uDstOut.au8[ 8] = uSrc1.au8[4];
12949 uDstOut.au8[ 9] = uSrc2.au8[4];
12950 uDstOut.au8[10] = uSrc1.au8[5];
12951 uDstOut.au8[11] = uSrc2.au8[5];
12952 uDstOut.au8[12] = uSrc1.au8[6];
12953 uDstOut.au8[13] = uSrc2.au8[6];
12954 uDstOut.au8[14] = uSrc1.au8[7];
12955 uDstOut.au8[15] = uSrc2.au8[7];
12956 *puDst = uDstOut;
12957}
12958
12959#endif
12960
12961IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpcklbw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12962{
12963 RTUINT128U const uSrc2 = *puSrc2;
12964 RTUINT128U const uSrc1 = *puSrc1;
12965 ASMCompilerBarrier();
12966 RTUINT128U uDstOut;
12967 uDstOut.au8[ 0] = uSrc1.au8[0];
12968 uDstOut.au8[ 1] = uSrc2.au8[0];
12969 uDstOut.au8[ 2] = uSrc1.au8[1];
12970 uDstOut.au8[ 3] = uSrc2.au8[1];
12971 uDstOut.au8[ 4] = uSrc1.au8[2];
12972 uDstOut.au8[ 5] = uSrc2.au8[2];
12973 uDstOut.au8[ 6] = uSrc1.au8[3];
12974 uDstOut.au8[ 7] = uSrc2.au8[3];
12975 uDstOut.au8[ 8] = uSrc1.au8[4];
12976 uDstOut.au8[ 9] = uSrc2.au8[4];
12977 uDstOut.au8[10] = uSrc1.au8[5];
12978 uDstOut.au8[11] = uSrc2.au8[5];
12979 uDstOut.au8[12] = uSrc1.au8[6];
12980 uDstOut.au8[13] = uSrc2.au8[6];
12981 uDstOut.au8[14] = uSrc1.au8[7];
12982 uDstOut.au8[15] = uSrc2.au8[7];
12983 *puDst = uDstOut;
12984}
12985
12986
12987IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpcklbw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12988{
12989 RTUINT256U const uSrc2 = *puSrc2;
12990 RTUINT256U const uSrc1 = *puSrc1;
12991 ASMCompilerBarrier();
12992 RTUINT256U uDstOut;
12993 uDstOut.au8[ 0] = uSrc1.au8[0];
12994 uDstOut.au8[ 1] = uSrc2.au8[0];
12995 uDstOut.au8[ 2] = uSrc1.au8[1];
12996 uDstOut.au8[ 3] = uSrc2.au8[1];
12997 uDstOut.au8[ 4] = uSrc1.au8[2];
12998 uDstOut.au8[ 5] = uSrc2.au8[2];
12999 uDstOut.au8[ 6] = uSrc1.au8[3];
13000 uDstOut.au8[ 7] = uSrc2.au8[3];
13001 uDstOut.au8[ 8] = uSrc1.au8[4];
13002 uDstOut.au8[ 9] = uSrc2.au8[4];
13003 uDstOut.au8[10] = uSrc1.au8[5];
13004 uDstOut.au8[11] = uSrc2.au8[5];
13005 uDstOut.au8[12] = uSrc1.au8[6];
13006 uDstOut.au8[13] = uSrc2.au8[6];
13007 uDstOut.au8[14] = uSrc1.au8[7];
13008 uDstOut.au8[15] = uSrc2.au8[7];
13009 /* As usual, the upper 128-bits are treated like a parallel register to the lower half. */
13010 uDstOut.au8[16] = uSrc1.au8[16];
13011 uDstOut.au8[17] = uSrc2.au8[16];
13012 uDstOut.au8[18] = uSrc1.au8[17];
13013 uDstOut.au8[19] = uSrc2.au8[17];
13014 uDstOut.au8[20] = uSrc1.au8[18];
13015 uDstOut.au8[21] = uSrc2.au8[18];
13016 uDstOut.au8[22] = uSrc1.au8[19];
13017 uDstOut.au8[23] = uSrc2.au8[19];
13018 uDstOut.au8[24] = uSrc1.au8[20];
13019 uDstOut.au8[25] = uSrc2.au8[20];
13020 uDstOut.au8[26] = uSrc1.au8[21];
13021 uDstOut.au8[27] = uSrc2.au8[21];
13022 uDstOut.au8[28] = uSrc1.au8[22];
13023 uDstOut.au8[29] = uSrc2.au8[22];
13024 uDstOut.au8[30] = uSrc1.au8[23];
13025 uDstOut.au8[31] = uSrc2.au8[23];
13026 *puDst = uDstOut;
13027}
13028
13029
13030/*
13031 * PUNPCKLWD - low words -> dwords
13032 */
13033#ifdef IEM_WITHOUT_ASSEMBLY
13034
13035IEM_DECL_IMPL_DEF(void, iemAImpl_punpcklwd_u64,(uint64_t *puDst, uint64_t const *puSrc))
13036{
13037 RTUINT64U const uSrc2 = { *puSrc };
13038 RTUINT64U const uSrc1 = { *puDst };
13039 ASMCompilerBarrier();
13040 RTUINT64U uDstOut;
13041 uDstOut.au16[0] = uSrc1.au16[0];
13042 uDstOut.au16[1] = uSrc2.au16[0];
13043 uDstOut.au16[2] = uSrc1.au16[1];
13044 uDstOut.au16[3] = uSrc2.au16[1];
13045 *puDst = uDstOut.u;
13046}
13047
13048
13049IEM_DECL_IMPL_DEF(void, iemAImpl_punpcklwd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13050{
13051 RTUINT128U const uSrc2 = *puSrc;
13052 RTUINT128U const uSrc1 = *puDst;
13053 ASMCompilerBarrier();
13054 RTUINT128U uDstOut;
13055 uDstOut.au16[0] = uSrc1.au16[0];
13056 uDstOut.au16[1] = uSrc2.au16[0];
13057 uDstOut.au16[2] = uSrc1.au16[1];
13058 uDstOut.au16[3] = uSrc2.au16[1];
13059 uDstOut.au16[4] = uSrc1.au16[2];
13060 uDstOut.au16[5] = uSrc2.au16[2];
13061 uDstOut.au16[6] = uSrc1.au16[3];
13062 uDstOut.au16[7] = uSrc2.au16[3];
13063 *puDst = uDstOut;
13064}
13065
13066#endif
13067
13068IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpcklwd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13069{
13070 RTUINT128U const uSrc2 = *puSrc2;
13071 RTUINT128U const uSrc1 = *puSrc1;
13072 ASMCompilerBarrier();
13073 RTUINT128U uDstOut;
13074 uDstOut.au16[0] = uSrc1.au16[0];
13075 uDstOut.au16[1] = uSrc2.au16[0];
13076 uDstOut.au16[2] = uSrc1.au16[1];
13077 uDstOut.au16[3] = uSrc2.au16[1];
13078 uDstOut.au16[4] = uSrc1.au16[2];
13079 uDstOut.au16[5] = uSrc2.au16[2];
13080 uDstOut.au16[6] = uSrc1.au16[3];
13081 uDstOut.au16[7] = uSrc2.au16[3];
13082 *puDst = uDstOut;
13083}
13084
13085
13086IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpcklwd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13087{
13088 RTUINT256U const uSrc2 = *puSrc2;
13089 RTUINT256U const uSrc1 = *puSrc1;
13090 ASMCompilerBarrier();
13091 RTUINT256U uDstOut;
13092 uDstOut.au16[0] = uSrc1.au16[0];
13093 uDstOut.au16[1] = uSrc2.au16[0];
13094 uDstOut.au16[2] = uSrc1.au16[1];
13095 uDstOut.au16[3] = uSrc2.au16[1];
13096 uDstOut.au16[4] = uSrc1.au16[2];
13097 uDstOut.au16[5] = uSrc2.au16[2];
13098 uDstOut.au16[6] = uSrc1.au16[3];
13099 uDstOut.au16[7] = uSrc2.au16[3];
13100
13101 uDstOut.au16[8] = uSrc1.au16[8];
13102 uDstOut.au16[9] = uSrc2.au16[8];
13103 uDstOut.au16[10] = uSrc1.au16[9];
13104 uDstOut.au16[11] = uSrc2.au16[9];
13105 uDstOut.au16[12] = uSrc1.au16[10];
13106 uDstOut.au16[13] = uSrc2.au16[10];
13107 uDstOut.au16[14] = uSrc1.au16[11];
13108 uDstOut.au16[15] = uSrc2.au16[11];
13109 *puDst = uDstOut;
13110}
13111
13112
13113/*
13114 * PUNPCKLDQ - low dwords -> qword(s)
13115 */
13116#ifdef IEM_WITHOUT_ASSEMBLY
13117
13118IEM_DECL_IMPL_DEF(void, iemAImpl_punpckldq_u64,(uint64_t *puDst, uint64_t const *puSrc))
13119{
13120 RTUINT64U const uSrc2 = { *puSrc };
13121 RTUINT64U const uSrc1 = { *puDst };
13122 ASMCompilerBarrier();
13123 RTUINT64U uDstOut;
13124 uDstOut.au32[0] = uSrc1.au32[0];
13125 uDstOut.au32[1] = uSrc2.au32[0];
13126 *puDst = uDstOut.u;
13127}
13128
13129
13130IEM_DECL_IMPL_DEF(void, iemAImpl_punpckldq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13131{
13132 RTUINT128U const uSrc2 = *puSrc;
13133 RTUINT128U const uSrc1 = *puDst;
13134 ASMCompilerBarrier();
13135 RTUINT128U uDstOut;
13136 uDstOut.au32[0] = uSrc1.au32[0];
13137 uDstOut.au32[1] = uSrc2.au32[0];
13138 uDstOut.au32[2] = uSrc1.au32[1];
13139 uDstOut.au32[3] = uSrc2.au32[1];
13140 *puDst = uDstOut;
13141}
13142
13143#endif
13144
13145IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckldq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13146{
13147 RTUINT128U const uSrc2 = *puSrc2;
13148 RTUINT128U const uSrc1 = *puSrc1;
13149 ASMCompilerBarrier();
13150 RTUINT128U uDstOut;
13151 uDstOut.au32[0] = uSrc1.au32[0];
13152 uDstOut.au32[1] = uSrc2.au32[0];
13153 uDstOut.au32[2] = uSrc1.au32[1];
13154 uDstOut.au32[3] = uSrc2.au32[1];
13155 *puDst = uDstOut;
13156}
13157
13158
13159IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckldq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13160{
13161 RTUINT256U const uSrc2 = *puSrc2;
13162 RTUINT256U const uSrc1 = *puSrc1;
13163 ASMCompilerBarrier();
13164 RTUINT256U uDstOut;
13165 uDstOut.au32[0] = uSrc1.au32[0];
13166 uDstOut.au32[1] = uSrc2.au32[0];
13167 uDstOut.au32[2] = uSrc1.au32[1];
13168 uDstOut.au32[3] = uSrc2.au32[1];
13169
13170 uDstOut.au32[4] = uSrc1.au32[4];
13171 uDstOut.au32[5] = uSrc2.au32[4];
13172 uDstOut.au32[6] = uSrc1.au32[5];
13173 uDstOut.au32[7] = uSrc2.au32[5];
13174 *puDst = uDstOut;
13175}
13176
13177
13178/*
13179 * PUNPCKLQDQ -> Low qwords -> double qword(s).
13180 */
13181#ifdef IEM_WITHOUT_ASSEMBLY
13182IEM_DECL_IMPL_DEF(void, iemAImpl_punpcklqdq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13183{
13184 RTUINT128U const uSrc2 = *puSrc;
13185 RTUINT128U const uSrc1 = *puDst;
13186 ASMCompilerBarrier();
13187 RTUINT128U uDstOut;
13188 uDstOut.au64[0] = uSrc1.au64[0];
13189 uDstOut.au64[1] = uSrc2.au64[0];
13190 *puDst = uDstOut;
13191}
13192#endif
13193
13194
13195IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpcklqdq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13196{
13197 RTUINT128U const uSrc2 = *puSrc2;
13198 RTUINT128U const uSrc1 = *puSrc1;
13199 ASMCompilerBarrier();
13200 RTUINT128U uDstOut;
13201 uDstOut.au64[0] = uSrc1.au64[0];
13202 uDstOut.au64[1] = uSrc2.au64[0];
13203 *puDst = uDstOut;
13204}
13205
13206
13207IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpcklqdq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13208{
13209 RTUINT256U const uSrc2 = *puSrc2;
13210 RTUINT256U const uSrc1 = *puSrc1;
13211 ASMCompilerBarrier();
13212 RTUINT256U uDstOut;
13213 uDstOut.au64[0] = uSrc1.au64[0];
13214 uDstOut.au64[1] = uSrc2.au64[0];
13215
13216 uDstOut.au64[2] = uSrc1.au64[2];
13217 uDstOut.au64[3] = uSrc2.au64[2];
13218 *puDst = uDstOut;
13219}
13220
13221
13222/*
13223 * PACKSSWB - signed words -> signed bytes
13224 */
13225
13226#ifdef IEM_WITHOUT_ASSEMBLY
13227
13228IEM_DECL_IMPL_DEF(void, iemAImpl_packsswb_u64,(uint64_t *puDst, uint64_t const *puSrc))
13229{
13230 RTUINT64U const uSrc2 = { *puSrc };
13231 RTUINT64U const uSrc1 = { *puDst };
13232 ASMCompilerBarrier();
13233 RTUINT64U uDstOut;
13234 uDstOut.au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[0]);
13235 uDstOut.au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[1]);
13236 uDstOut.au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[2]);
13237 uDstOut.au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[3]);
13238 uDstOut.au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[0]);
13239 uDstOut.au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[1]);
13240 uDstOut.au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[2]);
13241 uDstOut.au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[3]);
13242 *puDst = uDstOut.u;
13243}
13244
13245
13246IEM_DECL_IMPL_DEF(void, iemAImpl_packsswb_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13247{
13248 RTUINT128U const uSrc2 = *puSrc;
13249 RTUINT128U const uSrc1 = *puDst;
13250 ASMCompilerBarrier();
13251 RTUINT128U uDstOut;
13252 uDstOut.au8[ 0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[0]);
13253 uDstOut.au8[ 1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[1]);
13254 uDstOut.au8[ 2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[2]);
13255 uDstOut.au8[ 3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[3]);
13256 uDstOut.au8[ 4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[4]);
13257 uDstOut.au8[ 5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[5]);
13258 uDstOut.au8[ 6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[6]);
13259 uDstOut.au8[ 7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[7]);
13260 uDstOut.au8[ 8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[0]);
13261 uDstOut.au8[ 9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[1]);
13262 uDstOut.au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[2]);
13263 uDstOut.au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[3]);
13264 uDstOut.au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[4]);
13265 uDstOut.au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[5]);
13266 uDstOut.au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[6]);
13267 uDstOut.au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[7]);
13268 *puDst = uDstOut;
13269}
13270
13271#endif
13272
13273IEM_DECL_IMPL_DEF(void, iemAImpl_vpacksswb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13274{
13275 RTUINT128U const uSrc2 = *puSrc2;
13276 RTUINT128U const uSrc1 = *puSrc1;
13277 ASMCompilerBarrier();
13278 RTUINT128U uDstOut;
13279 uDstOut.au8[ 0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[0]);
13280 uDstOut.au8[ 1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[1]);
13281 uDstOut.au8[ 2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[2]);
13282 uDstOut.au8[ 3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[3]);
13283 uDstOut.au8[ 4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[4]);
13284 uDstOut.au8[ 5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[5]);
13285 uDstOut.au8[ 6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[6]);
13286 uDstOut.au8[ 7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[7]);
13287 uDstOut.au8[ 8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[0]);
13288 uDstOut.au8[ 9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[1]);
13289 uDstOut.au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[2]);
13290 uDstOut.au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[3]);
13291 uDstOut.au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[4]);
13292 uDstOut.au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[5]);
13293 uDstOut.au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[6]);
13294 uDstOut.au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[7]);
13295 *puDst = uDstOut;
13296}
13297
13298
13299IEM_DECL_IMPL_DEF(void, iemAImpl_vpacksswb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13300{
13301 RTUINT256U const uSrc2 = *puSrc2;
13302 RTUINT256U const uSrc1 = *puSrc1;
13303 ASMCompilerBarrier();
13304 RTUINT256U uDstOut;
13305 uDstOut.au8[ 0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[0]);
13306 uDstOut.au8[ 1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[1]);
13307 uDstOut.au8[ 2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[2]);
13308 uDstOut.au8[ 3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[3]);
13309 uDstOut.au8[ 4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[4]);
13310 uDstOut.au8[ 5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[5]);
13311 uDstOut.au8[ 6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[6]);
13312 uDstOut.au8[ 7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[7]);
13313 uDstOut.au8[ 8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[0]);
13314 uDstOut.au8[ 9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[1]);
13315 uDstOut.au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[2]);
13316 uDstOut.au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[3]);
13317 uDstOut.au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[4]);
13318 uDstOut.au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[5]);
13319 uDstOut.au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[6]);
13320 uDstOut.au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[7]);
13321
13322 uDstOut.au8[16] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[ 8]);
13323 uDstOut.au8[17] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[ 9]);
13324 uDstOut.au8[18] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[10]);
13325 uDstOut.au8[19] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[11]);
13326 uDstOut.au8[20] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[12]);
13327 uDstOut.au8[21] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[13]);
13328 uDstOut.au8[22] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[14]);
13329 uDstOut.au8[23] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[15]);
13330 uDstOut.au8[24] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[ 8]);
13331 uDstOut.au8[25] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[ 9]);
13332 uDstOut.au8[26] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[10]);
13333 uDstOut.au8[27] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[11]);
13334 uDstOut.au8[28] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[12]);
13335 uDstOut.au8[29] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[13]);
13336 uDstOut.au8[30] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[14]);
13337 uDstOut.au8[31] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[15]);
13338 *puDst = uDstOut;
13339}
13340
13341
13342/*
13343 * PACKUSWB - signed words -> unsigned bytes
13344 */
/* Values whose unsigned 16-bit view is at most 0xff pass through unchanged.
   Otherwise bit 15 (the sign) decides: set gives 0x00, clear gives 0xff. */
#define SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(a_iWord) \
    (   (uint16_t)(a_iWord) > (uint16_t)0xff \
      ? (uint8_t)0xff * (uint8_t)((((a_iWord) >> 15) & 1) ^ 1) \
      : (uint8_t)(a_iWord) )
13349
13350#ifdef IEM_WITHOUT_ASSEMBLY
13351
13352IEM_DECL_IMPL_DEF(void, iemAImpl_packuswb_u64,(uint64_t *puDst, uint64_t const *puSrc))
13353{
13354 RTUINT64U const uSrc2 = { *puSrc };
13355 RTUINT64U const uSrc1 = { *puDst };
13356 ASMCompilerBarrier();
13357 RTUINT64U uDstOut;
13358 uDstOut.au8[0] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[0]);
13359 uDstOut.au8[1] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[1]);
13360 uDstOut.au8[2] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[2]);
13361 uDstOut.au8[3] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[3]);
13362 uDstOut.au8[4] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[0]);
13363 uDstOut.au8[5] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[1]);
13364 uDstOut.au8[6] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[2]);
13365 uDstOut.au8[7] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[3]);
13366 *puDst = uDstOut.u;
13367}
13368
13369
13370IEM_DECL_IMPL_DEF(void, iemAImpl_packuswb_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13371{
13372 RTUINT128U const uSrc2 = *puSrc;
13373 RTUINT128U const uSrc1 = *puDst;
13374 ASMCompilerBarrier();
13375 RTUINT128U uDstOut;
13376 uDstOut.au8[ 0] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[0]);
13377 uDstOut.au8[ 1] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[1]);
13378 uDstOut.au8[ 2] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[2]);
13379 uDstOut.au8[ 3] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[3]);
13380 uDstOut.au8[ 4] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[4]);
13381 uDstOut.au8[ 5] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[5]);
13382 uDstOut.au8[ 6] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[6]);
13383 uDstOut.au8[ 7] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[7]);
13384 uDstOut.au8[ 8] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[0]);
13385 uDstOut.au8[ 9] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[1]);
13386 uDstOut.au8[10] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[2]);
13387 uDstOut.au8[11] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[3]);
13388 uDstOut.au8[12] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[4]);
13389 uDstOut.au8[13] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[5]);
13390 uDstOut.au8[14] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[6]);
13391 uDstOut.au8[15] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[7]);
13392 *puDst = uDstOut;
13393}
13394
13395#endif
13396
13397IEM_DECL_IMPL_DEF(void, iemAImpl_vpackuswb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13398{
13399 RTUINT128U const uSrc2 = *puSrc2;
13400 RTUINT128U const uSrc1 = *puSrc1;
13401 ASMCompilerBarrier();
13402 RTUINT128U uDstOut;
13403 uDstOut.au8[ 0] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[0]);
13404 uDstOut.au8[ 1] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[1]);
13405 uDstOut.au8[ 2] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[2]);
13406 uDstOut.au8[ 3] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[3]);
13407 uDstOut.au8[ 4] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[4]);
13408 uDstOut.au8[ 5] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[5]);
13409 uDstOut.au8[ 6] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[6]);
13410 uDstOut.au8[ 7] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[7]);
13411 uDstOut.au8[ 8] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[0]);
13412 uDstOut.au8[ 9] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[1]);
13413 uDstOut.au8[10] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[2]);
13414 uDstOut.au8[11] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[3]);
13415 uDstOut.au8[12] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[4]);
13416 uDstOut.au8[13] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[5]);
13417 uDstOut.au8[14] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[6]);
13418 uDstOut.au8[15] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[7]);
13419 *puDst = uDstOut;
13420}
13421
13422
13423IEM_DECL_IMPL_DEF(void, iemAImpl_vpackuswb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13424{
13425 RTUINT256U const uSrc2 = *puSrc2;
13426 RTUINT256U const uSrc1 = *puSrc1;
13427 ASMCompilerBarrier();
13428 RTUINT256U uDstOut;
13429 uDstOut.au8[ 0] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[0]);
13430 uDstOut.au8[ 1] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[1]);
13431 uDstOut.au8[ 2] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[2]);
13432 uDstOut.au8[ 3] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[3]);
13433 uDstOut.au8[ 4] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[4]);
13434 uDstOut.au8[ 5] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[5]);
13435 uDstOut.au8[ 6] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[6]);
13436 uDstOut.au8[ 7] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[7]);
13437 uDstOut.au8[ 8] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[0]);
13438 uDstOut.au8[ 9] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[1]);
13439 uDstOut.au8[10] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[2]);
13440 uDstOut.au8[11] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[3]);
13441 uDstOut.au8[12] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[4]);
13442 uDstOut.au8[13] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[5]);
13443 uDstOut.au8[14] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[6]);
13444 uDstOut.au8[15] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[7]);
13445
13446 uDstOut.au8[16] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[ 8]);
13447 uDstOut.au8[17] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[ 9]);
13448 uDstOut.au8[18] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[10]);
13449 uDstOut.au8[19] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[11]);
13450 uDstOut.au8[20] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[12]);
13451 uDstOut.au8[21] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[13]);
13452 uDstOut.au8[22] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[14]);
13453 uDstOut.au8[23] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[15]);
13454 uDstOut.au8[24] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[ 8]);
13455 uDstOut.au8[25] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[ 9]);
13456 uDstOut.au8[26] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[10]);
13457 uDstOut.au8[27] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[11]);
13458 uDstOut.au8[28] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[12]);
13459 uDstOut.au8[29] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[13]);
13460 uDstOut.au8[30] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[14]);
13461 uDstOut.au8[31] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[15]);
13462 *puDst = uDstOut;
13463}
13464
13465
13466/*
13467 * PACKSSDW - signed dwords -> signed words
13468 */
13469
13470#ifdef IEM_WITHOUT_ASSEMBLY
13471
13472IEM_DECL_IMPL_DEF(void, iemAImpl_packssdw_u64,(uint64_t *puDst, uint64_t const *puSrc))
13473{
13474 RTUINT64U const uSrc2 = { *puSrc };
13475 RTUINT64U const uSrc1 = { *puDst };
13476 ASMCompilerBarrier();
13477 RTUINT64U uDstOut;
13478 uDstOut.au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[0]);
13479 uDstOut.au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[1]);
13480 uDstOut.au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[0]);
13481 uDstOut.au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[1]);
13482 *puDst = uDstOut.u;
13483}
13484
13485
13486IEM_DECL_IMPL_DEF(void, iemAImpl_packssdw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13487{
13488 RTUINT128U const uSrc2 = *puSrc;
13489 RTUINT128U const uSrc1 = *puDst;
13490 ASMCompilerBarrier();
13491 RTUINT128U uDstOut;
13492 uDstOut.au16[ 0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[0]);
13493 uDstOut.au16[ 1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[1]);
13494 uDstOut.au16[ 2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[2]);
13495 uDstOut.au16[ 3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[3]);
13496 uDstOut.au16[ 4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[0]);
13497 uDstOut.au16[ 5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[1]);
13498 uDstOut.au16[ 6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[2]);
13499 uDstOut.au16[ 7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[3]);
13500 *puDst = uDstOut;
13501}
13502
13503#endif
13504
13505IEM_DECL_IMPL_DEF(void, iemAImpl_vpackssdw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13506{
13507 RTUINT128U const uSrc2 = *puSrc2;
13508 RTUINT128U const uSrc1 = *puSrc1;
13509 ASMCompilerBarrier();
13510 RTUINT128U uDstOut;
13511 uDstOut.au16[ 0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[0]);
13512 uDstOut.au16[ 1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[1]);
13513 uDstOut.au16[ 2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[2]);
13514 uDstOut.au16[ 3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[3]);
13515 uDstOut.au16[ 4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[0]);
13516 uDstOut.au16[ 5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[1]);
13517 uDstOut.au16[ 6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[2]);
13518 uDstOut.au16[ 7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[3]);
13519 *puDst = uDstOut;
13520}
13521
13522
13523IEM_DECL_IMPL_DEF(void, iemAImpl_vpackssdw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13524{
13525 RTUINT256U const uSrc2 = *puSrc2;
13526 RTUINT256U const uSrc1 = *puSrc1;
13527 ASMCompilerBarrier();
13528 RTUINT256U uDstOut;
13529 uDstOut.au16[ 0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[0]);
13530 uDstOut.au16[ 1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[1]);
13531 uDstOut.au16[ 2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[2]);
13532 uDstOut.au16[ 3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[3]);
13533 uDstOut.au16[ 4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[0]);
13534 uDstOut.au16[ 5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[1]);
13535 uDstOut.au16[ 6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[2]);
13536 uDstOut.au16[ 7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[3]);
13537
13538 uDstOut.au16[ 8] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[4]);
13539 uDstOut.au16[ 9] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[5]);
13540 uDstOut.au16[10] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[6]);
13541 uDstOut.au16[11] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[7]);
13542 uDstOut.au16[12] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[4]);
13543 uDstOut.au16[13] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[5]);
13544 uDstOut.au16[14] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[6]);
13545 uDstOut.au16[15] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[7]);
13546 *puDst = uDstOut;
13547}
13548
13549
13550/*
13551 * PACKUSDW - signed dwords -> unsigned words
13552 */
/* Values whose unsigned 32-bit view is at most 0xffff pass through unchanged.
   Otherwise bit 31 (the sign) decides: set gives 0x0000, clear gives 0xffff. */
#define SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(a_iDword) \
    (   (uint32_t)(a_iDword) > (uint16_t)0xffff \
      ? (uint16_t)0xffff * (uint16_t)((((a_iDword) >> 31) & 1) ^ 1) \
      : (uint16_t)(a_iDword) )
13557
13558#ifdef IEM_WITHOUT_ASSEMBLY
13559IEM_DECL_IMPL_DEF(void, iemAImpl_packusdw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13560{
13561 RTUINT128U const uSrc2 = *puSrc;
13562 RTUINT128U const uSrc1 = *puDst;
13563 ASMCompilerBarrier();
13564 RTUINT128U uDstOut;
13565 uDstOut.au16[ 0] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[0]);
13566 uDstOut.au16[ 1] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[1]);
13567 uDstOut.au16[ 2] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[2]);
13568 uDstOut.au16[ 3] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[3]);
13569 uDstOut.au16[ 4] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[0]);
13570 uDstOut.au16[ 5] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[1]);
13571 uDstOut.au16[ 6] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[2]);
13572 uDstOut.au16[ 7] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[3]);
13573 *puDst = uDstOut;
13574}
13575#endif
13576
13577IEM_DECL_IMPL_DEF(void, iemAImpl_vpackusdw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13578{
13579 RTUINT128U const uSrc2 = *puSrc2;
13580 RTUINT128U const uSrc1 = *puSrc1;
13581 ASMCompilerBarrier();
13582 RTUINT128U uDstOut;
13583 uDstOut.au16[ 0] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[0]);
13584 uDstOut.au16[ 1] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[1]);
13585 uDstOut.au16[ 2] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[2]);
13586 uDstOut.au16[ 3] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[3]);
13587 uDstOut.au16[ 4] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[0]);
13588 uDstOut.au16[ 5] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[1]);
13589 uDstOut.au16[ 6] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[2]);
13590 uDstOut.au16[ 7] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[3]);
13591 *puDst = uDstOut;
13592}
13593
13594
13595IEM_DECL_IMPL_DEF(void, iemAImpl_vpackusdw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13596{
13597 RTUINT256U const uSrc2 = *puSrc2;
13598 RTUINT256U const uSrc1 = *puSrc1;
13599 ASMCompilerBarrier();
13600 RTUINT256U uDstOut;
13601 uDstOut.au16[ 0] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[0]);
13602 uDstOut.au16[ 1] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[1]);
13603 uDstOut.au16[ 2] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[2]);
13604 uDstOut.au16[ 3] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[3]);
13605 uDstOut.au16[ 4] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[0]);
13606 uDstOut.au16[ 5] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[1]);
13607 uDstOut.au16[ 6] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[2]);
13608 uDstOut.au16[ 7] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[3]);
13609
13610 uDstOut.au16[ 8] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[4]);
13611 uDstOut.au16[ 9] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[5]);
13612 uDstOut.au16[10] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[6]);
13613 uDstOut.au16[11] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[7]);
13614 uDstOut.au16[12] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[4]);
13615 uDstOut.au16[13] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[5]);
13616 uDstOut.au16[14] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[6]);
13617 uDstOut.au16[15] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[7]);
13618 *puDst = uDstOut;
13619}
13620
13621
13622/*
13623 * [V]PABSB / [V]PABSW / [V]PABSD
13624 */
13625
13626IEM_DECL_IMPL_DEF(void, iemAImpl_pabsb_u64_fallback,(uint64_t *puDst, uint64_t const *puSrc))
13627{
13628 RTUINT64U const uSrc = { *puSrc };
13629 RTUINT64U uDstOut = { 0 };
13630
13631 uDstOut.au8[0] = RT_ABS(uSrc.ai8[0]);
13632 uDstOut.au8[1] = RT_ABS(uSrc.ai8[1]);
13633 uDstOut.au8[2] = RT_ABS(uSrc.ai8[2]);
13634 uDstOut.au8[3] = RT_ABS(uSrc.ai8[3]);
13635 uDstOut.au8[4] = RT_ABS(uSrc.ai8[4]);
13636 uDstOut.au8[5] = RT_ABS(uSrc.ai8[5]);
13637 uDstOut.au8[6] = RT_ABS(uSrc.ai8[6]);
13638 uDstOut.au8[7] = RT_ABS(uSrc.ai8[7]);
13639 *puDst = uDstOut.u;
13640}
13641
13642
13643IEM_DECL_IMPL_DEF(void, iemAImpl_pabsb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13644{
13645 puDst->au8[ 0] = RT_ABS(puSrc->ai8[ 0]);
13646 puDst->au8[ 1] = RT_ABS(puSrc->ai8[ 1]);
13647 puDst->au8[ 2] = RT_ABS(puSrc->ai8[ 2]);
13648 puDst->au8[ 3] = RT_ABS(puSrc->ai8[ 3]);
13649 puDst->au8[ 4] = RT_ABS(puSrc->ai8[ 4]);
13650 puDst->au8[ 5] = RT_ABS(puSrc->ai8[ 5]);
13651 puDst->au8[ 6] = RT_ABS(puSrc->ai8[ 6]);
13652 puDst->au8[ 7] = RT_ABS(puSrc->ai8[ 7]);
13653 puDst->au8[ 8] = RT_ABS(puSrc->ai8[ 8]);
13654 puDst->au8[ 9] = RT_ABS(puSrc->ai8[ 9]);
13655 puDst->au8[10] = RT_ABS(puSrc->ai8[10]);
13656 puDst->au8[11] = RT_ABS(puSrc->ai8[11]);
13657 puDst->au8[12] = RT_ABS(puSrc->ai8[12]);
13658 puDst->au8[13] = RT_ABS(puSrc->ai8[13]);
13659 puDst->au8[14] = RT_ABS(puSrc->ai8[14]);
13660 puDst->au8[15] = RT_ABS(puSrc->ai8[15]);
13661}
13662
13663
13664IEM_DECL_IMPL_DEF(void, iemAImpl_pabsw_u64_fallback,(uint64_t *puDst, uint64_t const *puSrc))
13665{
13666 RTUINT64U const uSrc = { *puSrc };
13667 RTUINT64U uDstOut = { 0 };
13668
13669 uDstOut.au16[0] = RT_ABS(uSrc.ai16[0]);
13670 uDstOut.au16[1] = RT_ABS(uSrc.ai16[1]);
13671 uDstOut.au16[2] = RT_ABS(uSrc.ai16[2]);
13672 uDstOut.au16[3] = RT_ABS(uSrc.ai16[3]);
13673 *puDst = uDstOut.u;
13674}
13675
13676
13677IEM_DECL_IMPL_DEF(void, iemAImpl_pabsw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13678{
13679 puDst->au16[ 0] = RT_ABS(puSrc->ai16[ 0]);
13680 puDst->au16[ 1] = RT_ABS(puSrc->ai16[ 1]);
13681 puDst->au16[ 2] = RT_ABS(puSrc->ai16[ 2]);
13682 puDst->au16[ 3] = RT_ABS(puSrc->ai16[ 3]);
13683 puDst->au16[ 4] = RT_ABS(puSrc->ai16[ 4]);
13684 puDst->au16[ 5] = RT_ABS(puSrc->ai16[ 5]);
13685 puDst->au16[ 6] = RT_ABS(puSrc->ai16[ 6]);
13686 puDst->au16[ 7] = RT_ABS(puSrc->ai16[ 7]);
13687}
13688
13689
13690IEM_DECL_IMPL_DEF(void, iemAImpl_pabsd_u64_fallback,(uint64_t *puDst, uint64_t const *puSrc))
13691{
13692 RTUINT64U const uSrc = { *puSrc };
13693 RTUINT64U uDstOut = { 0 };
13694
13695 uDstOut.au32[0] = RT_ABS(uSrc.ai32[0]);
13696 uDstOut.au32[1] = RT_ABS(uSrc.ai32[1]);
13697 *puDst = uDstOut.u;
13698}
13699
13700
13701IEM_DECL_IMPL_DEF(void, iemAImpl_pabsd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13702{
13703 puDst->au32[ 0] = RT_ABS(puSrc->ai32[ 0]);
13704 puDst->au32[ 1] = RT_ABS(puSrc->ai32[ 1]);
13705 puDst->au32[ 2] = RT_ABS(puSrc->ai32[ 2]);
13706 puDst->au32[ 3] = RT_ABS(puSrc->ai32[ 3]);
13707}
13708
13709
13710IEM_DECL_IMPL_DEF(void, iemAImpl_vpabsb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13711{
13712 puDst->au8[ 0] = RT_ABS(puSrc->ai8[ 0]);
13713 puDst->au8[ 1] = RT_ABS(puSrc->ai8[ 1]);
13714 puDst->au8[ 2] = RT_ABS(puSrc->ai8[ 2]);
13715 puDst->au8[ 3] = RT_ABS(puSrc->ai8[ 3]);
13716 puDst->au8[ 4] = RT_ABS(puSrc->ai8[ 4]);
13717 puDst->au8[ 5] = RT_ABS(puSrc->ai8[ 5]);
13718 puDst->au8[ 6] = RT_ABS(puSrc->ai8[ 6]);
13719 puDst->au8[ 7] = RT_ABS(puSrc->ai8[ 7]);
13720 puDst->au8[ 8] = RT_ABS(puSrc->ai8[ 8]);
13721 puDst->au8[ 9] = RT_ABS(puSrc->ai8[ 9]);
13722 puDst->au8[10] = RT_ABS(puSrc->ai8[10]);
13723 puDst->au8[11] = RT_ABS(puSrc->ai8[11]);
13724 puDst->au8[12] = RT_ABS(puSrc->ai8[12]);
13725 puDst->au8[13] = RT_ABS(puSrc->ai8[13]);
13726 puDst->au8[14] = RT_ABS(puSrc->ai8[14]);
13727 puDst->au8[15] = RT_ABS(puSrc->ai8[15]);
13728}
13729
13730
13731IEM_DECL_IMPL_DEF(void, iemAImpl_vpabsb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc))
13732{
13733 puDst->au8[ 0] = RT_ABS(puSrc->ai8[ 0]);
13734 puDst->au8[ 1] = RT_ABS(puSrc->ai8[ 1]);
13735 puDst->au8[ 2] = RT_ABS(puSrc->ai8[ 2]);
13736 puDst->au8[ 3] = RT_ABS(puSrc->ai8[ 3]);
13737 puDst->au8[ 4] = RT_ABS(puSrc->ai8[ 4]);
13738 puDst->au8[ 5] = RT_ABS(puSrc->ai8[ 5]);
13739 puDst->au8[ 6] = RT_ABS(puSrc->ai8[ 6]);
13740 puDst->au8[ 7] = RT_ABS(puSrc->ai8[ 7]);
13741 puDst->au8[ 8] = RT_ABS(puSrc->ai8[ 8]);
13742 puDst->au8[ 9] = RT_ABS(puSrc->ai8[ 9]);
13743 puDst->au8[10] = RT_ABS(puSrc->ai8[10]);
13744 puDst->au8[11] = RT_ABS(puSrc->ai8[11]);
13745 puDst->au8[12] = RT_ABS(puSrc->ai8[12]);
13746 puDst->au8[13] = RT_ABS(puSrc->ai8[13]);
13747 puDst->au8[14] = RT_ABS(puSrc->ai8[14]);
13748 puDst->au8[15] = RT_ABS(puSrc->ai8[15]);
13749 puDst->au8[16] = RT_ABS(puSrc->ai8[16]);
13750 puDst->au8[17] = RT_ABS(puSrc->ai8[17]);
13751 puDst->au8[18] = RT_ABS(puSrc->ai8[18]);
13752 puDst->au8[19] = RT_ABS(puSrc->ai8[19]);
13753 puDst->au8[20] = RT_ABS(puSrc->ai8[20]);
13754 puDst->au8[21] = RT_ABS(puSrc->ai8[21]);
13755 puDst->au8[22] = RT_ABS(puSrc->ai8[22]);
13756 puDst->au8[23] = RT_ABS(puSrc->ai8[23]);
13757 puDst->au8[24] = RT_ABS(puSrc->ai8[24]);
13758 puDst->au8[25] = RT_ABS(puSrc->ai8[25]);
13759 puDst->au8[26] = RT_ABS(puSrc->ai8[26]);
13760 puDst->au8[27] = RT_ABS(puSrc->ai8[27]);
13761 puDst->au8[28] = RT_ABS(puSrc->ai8[28]);
13762 puDst->au8[29] = RT_ABS(puSrc->ai8[29]);
13763 puDst->au8[30] = RT_ABS(puSrc->ai8[30]);
13764 puDst->au8[31] = RT_ABS(puSrc->ai8[31]);
13765}
13766
13767
13768IEM_DECL_IMPL_DEF(void, iemAImpl_vpabsw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13769{
13770 puDst->au16[ 0] = RT_ABS(puSrc->ai16[ 0]);
13771 puDst->au16[ 1] = RT_ABS(puSrc->ai16[ 1]);
13772 puDst->au16[ 2] = RT_ABS(puSrc->ai16[ 2]);
13773 puDst->au16[ 3] = RT_ABS(puSrc->ai16[ 3]);
13774 puDst->au16[ 4] = RT_ABS(puSrc->ai16[ 4]);
13775 puDst->au16[ 5] = RT_ABS(puSrc->ai16[ 5]);
13776 puDst->au16[ 6] = RT_ABS(puSrc->ai16[ 6]);
13777 puDst->au16[ 7] = RT_ABS(puSrc->ai16[ 7]);
13778}
13779
13780
13781IEM_DECL_IMPL_DEF(void, iemAImpl_vpabsw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc))
13782{
13783 puDst->au16[ 0] = RT_ABS(puSrc->ai16[ 0]);
13784 puDst->au16[ 1] = RT_ABS(puSrc->ai16[ 1]);
13785 puDst->au16[ 2] = RT_ABS(puSrc->ai16[ 2]);
13786 puDst->au16[ 3] = RT_ABS(puSrc->ai16[ 3]);
13787 puDst->au16[ 4] = RT_ABS(puSrc->ai16[ 4]);
13788 puDst->au16[ 5] = RT_ABS(puSrc->ai16[ 5]);
13789 puDst->au16[ 6] = RT_ABS(puSrc->ai16[ 6]);
13790 puDst->au16[ 7] = RT_ABS(puSrc->ai16[ 7]);
13791 puDst->au16[ 8] = RT_ABS(puSrc->ai16[ 8]);
13792 puDst->au16[ 9] = RT_ABS(puSrc->ai16[ 9]);
13793 puDst->au16[10] = RT_ABS(puSrc->ai16[10]);
13794 puDst->au16[11] = RT_ABS(puSrc->ai16[11]);
13795 puDst->au16[12] = RT_ABS(puSrc->ai16[12]);
13796 puDst->au16[13] = RT_ABS(puSrc->ai16[13]);
13797 puDst->au16[14] = RT_ABS(puSrc->ai16[14]);
13798 puDst->au16[15] = RT_ABS(puSrc->ai16[15]);
13799}
13800
13801
13802IEM_DECL_IMPL_DEF(void, iemAImpl_vpabsd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13803{
13804 puDst->au32[ 0] = RT_ABS(puSrc->ai32[ 0]);
13805 puDst->au32[ 1] = RT_ABS(puSrc->ai32[ 1]);
13806 puDst->au32[ 2] = RT_ABS(puSrc->ai32[ 2]);
13807 puDst->au32[ 3] = RT_ABS(puSrc->ai32[ 3]);
13808}
13809
13810
13811IEM_DECL_IMPL_DEF(void, iemAImpl_vpabsd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc))
13812{
13813 puDst->au32[ 0] = RT_ABS(puSrc->ai32[ 0]);
13814 puDst->au32[ 1] = RT_ABS(puSrc->ai32[ 1]);
13815 puDst->au32[ 2] = RT_ABS(puSrc->ai32[ 2]);
13816 puDst->au32[ 3] = RT_ABS(puSrc->ai32[ 3]);
13817 puDst->au32[ 4] = RT_ABS(puSrc->ai32[ 4]);
13818 puDst->au32[ 5] = RT_ABS(puSrc->ai32[ 5]);
13819 puDst->au32[ 6] = RT_ABS(puSrc->ai32[ 6]);
13820 puDst->au32[ 7] = RT_ABS(puSrc->ai32[ 7]);
13821}
13822
13823
13824/*
13825 * PSIGNB / VPSIGNB / PSIGNW / VPSIGNW / PSIGND / VPSIGND
13826 */
13827IEM_DECL_IMPL_DEF(void, iemAImpl_psignb_u64_fallback,(uint64_t *puDst, uint64_t const *puSrc))
13828{
13829 RTUINT64U uSrc1 = { *puDst };
13830 RTUINT64U uSrc2 = { *puSrc };
13831 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
13832
13833 for (uint32_t i = 0; i < RT_ELEMENTS(uDst.ai8); i++)
13834 {
13835 if (uSrc2.ai8[i] < 0)
13836 uDst.ai8[i] = -uSrc1.ai8[i];
13837 else if (uSrc2.ai8[i] == 0)
13838 uDst.ai8[i] = 0;
13839 else /* uSrc2.ai8[i] > 0 */
13840 uDst.ai8[i] = uSrc1.ai8[i];
13841 }
13842
13843 *puDst = uDst.u;
13844}
13845
13846
13847IEM_DECL_IMPL_DEF(void, iemAImpl_psignb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13848{
13849 RTUINT128U uSrc1 = *puDst;
13850
13851 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai8); i++)
13852 {
13853 if (puSrc->ai8[i] < 0)
13854 puDst->ai8[i] = -uSrc1.ai8[i];
13855 else if (puSrc->ai8[i] == 0)
13856 puDst->ai8[i] = 0;
13857 else /* puSrc->ai8[i] > 0 */
13858 puDst->ai8[i] = uSrc1.ai8[i];
13859 }
13860}
13861
13862
13863IEM_DECL_IMPL_DEF(void, iemAImpl_psignw_u64_fallback,(uint64_t *puDst, uint64_t const *puSrc))
13864{
13865 RTUINT64U uSrc1 = { *puDst };
13866 RTUINT64U uSrc2 = { *puSrc };
13867 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
13868
13869 for (uint32_t i = 0; i < RT_ELEMENTS(uDst.ai16); i++)
13870 {
13871 if (uSrc2.ai16[i] < 0)
13872 uDst.ai16[i] = -uSrc1.ai16[i];
13873 else if (uSrc2.ai16[i] == 0)
13874 uDst.ai16[i] = 0;
13875 else /* uSrc2.ai16[i] > 0 */
13876 uDst.ai16[i] = uSrc1.ai16[i];
13877 }
13878
13879 *puDst = uDst.u;
13880}
13881
13882
13883IEM_DECL_IMPL_DEF(void, iemAImpl_psignw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13884{
13885 RTUINT128U uSrc1 = *puDst;
13886
13887 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai16); i++)
13888 {
13889 if (puSrc->ai16[i] < 0)
13890 puDst->ai16[i] = -uSrc1.ai16[i];
13891 else if (puSrc->ai16[i] == 0)
13892 puDst->ai16[i] = 0;
13893 else /* puSrc->ai16[i] > 0 */
13894 puDst->ai16[i] = uSrc1.ai16[i];
13895 }
13896}
13897
13898
13899IEM_DECL_IMPL_DEF(void, iemAImpl_psignd_u64_fallback,(uint64_t *puDst, uint64_t const *puSrc))
13900{
13901 RTUINT64U uSrc1 = { *puDst };
13902 RTUINT64U uSrc2 = { *puSrc };
13903 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
13904
13905 for (uint32_t i = 0; i < RT_ELEMENTS(uDst.ai32); i++)
13906 {
13907 if (uSrc2.ai32[i] < 0)
13908 uDst.ai32[i] = -uSrc1.ai32[i];
13909 else if (uSrc2.ai32[i] == 0)
13910 uDst.ai32[i] = 0;
13911 else /* uSrc2.ai32[i] > 0 */
13912 uDst.ai32[i] = uSrc1.ai32[i];
13913 }
13914
13915 *puDst = uDst.u;
13916}
13917
13918
13919IEM_DECL_IMPL_DEF(void, iemAImpl_psignd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13920{
13921 RTUINT128U uSrc1 = *puDst;
13922
13923 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai32); i++)
13924 {
13925 if (puSrc->ai32[i] < 0)
13926 puDst->ai32[i] = -uSrc1.ai32[i];
13927 else if (puSrc->ai32[i] == 0)
13928 puDst->ai32[i] = 0;
13929 else /* puSrc->ai32[i] > 0 */
13930 puDst->ai32[i] = uSrc1.ai32[i];
13931 }
13932}
13933
13934
13935IEM_DECL_IMPL_DEF(void, iemAImpl_vpsignb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13936{
13937 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai8); i++)
13938 {
13939 if (puSrc2->ai8[i] < 0)
13940 puDst->ai8[i] = -puSrc1->ai8[i];
13941 else if (puSrc2->ai8[i] == 0)
13942 puDst->ai8[i] = 0;
13943 else /* puSrc2->ai8[i] > 0 */
13944 puDst->ai8[i] = puSrc1->ai8[i];
13945 }
13946}
13947
13948
13949IEM_DECL_IMPL_DEF(void, iemAImpl_vpsignb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13950{
13951 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai8); i++)
13952 {
13953 if (puSrc2->ai8[i] < 0)
13954 puDst->ai8[i] = -puSrc1->ai8[i];
13955 else if (puSrc2->ai8[i] == 0)
13956 puDst->ai8[i] = 0;
13957 else /* puSrc2->ai8[i] > 0 */
13958 puDst->ai8[i] = puSrc1->ai8[i];
13959 }
13960}
13961
13962
13963IEM_DECL_IMPL_DEF(void, iemAImpl_vpsignw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13964{
13965 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai16); i++)
13966 {
13967 if (puSrc2->ai16[i] < 0)
13968 puDst->ai16[i] = -puSrc1->ai16[i];
13969 else if (puSrc2->ai16[i] == 0)
13970 puDst->ai16[i] = 0;
13971 else /* puSrc2->ai16[i] > 0 */
13972 puDst->ai16[i] = puSrc1->ai16[i];
13973 }
13974}
13975
13976
13977IEM_DECL_IMPL_DEF(void, iemAImpl_vpsignw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13978{
13979 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai16); i++)
13980 {
13981 if (puSrc2->ai16[i] < 0)
13982 puDst->ai16[i] = -puSrc1->ai16[i];
13983 else if (puSrc2->ai16[i] == 0)
13984 puDst->ai16[i] = 0;
13985 else /* puSrc2->ai16[i] > 0 */
13986 puDst->ai16[i] = puSrc1->ai16[i];
13987 }
13988}
13989
13990
13991IEM_DECL_IMPL_DEF(void, iemAImpl_vpsignd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13992{
13993 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai32); i++)
13994 {
13995 if (puSrc2->ai32[i] < 0)
13996 puDst->ai32[i] = -puSrc1->ai32[i];
13997 else if (puSrc2->ai32[i] == 0)
13998 puDst->ai32[i] = 0;
13999 else /* puSrc2->ai32[i] > 0 */
14000 puDst->ai32[i] = puSrc1->ai32[i];
14001 }
14002}
14003
14004
14005IEM_DECL_IMPL_DEF(void, iemAImpl_vpsignd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14006{
14007 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai32); i++)
14008 {
14009 if (puSrc2->ai32[i] < 0)
14010 puDst->ai32[i] = -puSrc1->ai32[i];
14011 else if (puSrc2->ai32[i] == 0)
14012 puDst->ai32[i] = 0;
14013 else /* puSrc2->ai32[i] > 0 */
14014 puDst->ai32[i] = puSrc1->ai32[i];
14015 }
14016}
14017
14018
14019/*
14020 * PHADDW / VPHADDW / PHADDD / VPHADDD
14021 */
14022IEM_DECL_IMPL_DEF(void, iemAImpl_phaddw_u64_fallback,(uint64_t *puDst, uint64_t const *puSrc))
14023{
14024 RTUINT64U uSrc1 = { *puDst };
14025 RTUINT64U uSrc2 = { *puSrc };
14026 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
14027
14028 uDst.ai16[0] = uSrc1.ai16[0] + uSrc1.ai16[1];
14029 uDst.ai16[1] = uSrc1.ai16[2] + uSrc1.ai16[3];
14030 uDst.ai16[2] = uSrc2.ai16[0] + uSrc2.ai16[1];
14031 uDst.ai16[3] = uSrc2.ai16[2] + uSrc2.ai16[3];
14032 *puDst = uDst.u;
14033}
14034
14035
14036IEM_DECL_IMPL_DEF(void, iemAImpl_phaddw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
14037{
14038 RTUINT128U uSrc1 = *puDst;
14039
14040 puDst->ai16[0] = uSrc1.ai16[0] + uSrc1.ai16[1];
14041 puDst->ai16[1] = uSrc1.ai16[2] + uSrc1.ai16[3];
14042 puDst->ai16[2] = uSrc1.ai16[4] + uSrc1.ai16[5];
14043 puDst->ai16[3] = uSrc1.ai16[6] + uSrc1.ai16[7];
14044
14045 puDst->ai16[4] = puSrc->ai16[0] + puSrc->ai16[1];
14046 puDst->ai16[5] = puSrc->ai16[2] + puSrc->ai16[3];
14047 puDst->ai16[6] = puSrc->ai16[4] + puSrc->ai16[5];
14048 puDst->ai16[7] = puSrc->ai16[6] + puSrc->ai16[7];
14049}
14050
14051
14052IEM_DECL_IMPL_DEF(void, iemAImpl_phaddd_u64_fallback,(uint64_t *puDst, uint64_t const *puSrc))
14053{
14054 RTUINT64U uSrc1 = { *puDst };
14055 RTUINT64U uSrc2 = { *puSrc };
14056 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
14057
14058 uDst.ai32[0] = uSrc1.ai32[0] + uSrc1.ai32[1];
14059 uDst.ai32[1] = uSrc2.ai32[0] + uSrc2.ai32[1];
14060 *puDst = uDst.u;
14061}
14062
14063
14064IEM_DECL_IMPL_DEF(void, iemAImpl_phaddd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
14065{
14066 RTUINT128U uSrc1 = *puDst;
14067
14068 puDst->ai32[0] = uSrc1.ai32[0] + uSrc1.ai32[1];
14069 puDst->ai32[1] = uSrc1.ai32[2] + uSrc1.ai32[3];
14070
14071 puDst->ai32[2] = puSrc->ai32[0] + puSrc->ai32[1];
14072 puDst->ai32[3] = puSrc->ai32[2] + puSrc->ai32[3];
14073}
14074
14075
14076IEM_DECL_IMPL_DEF(void, iemAImpl_vphaddw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14077{
14078 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
14079
14080 uDst.ai16[0] = puSrc1->ai16[0] + puSrc1->ai16[1];
14081 uDst.ai16[1] = puSrc1->ai16[2] + puSrc1->ai16[3];
14082 uDst.ai16[2] = puSrc1->ai16[4] + puSrc1->ai16[5];
14083 uDst.ai16[3] = puSrc1->ai16[6] + puSrc1->ai16[7];
14084
14085 uDst.ai16[4] = puSrc2->ai16[0] + puSrc2->ai16[1];
14086 uDst.ai16[5] = puSrc2->ai16[2] + puSrc2->ai16[3];
14087 uDst.ai16[6] = puSrc2->ai16[4] + puSrc2->ai16[5];
14088 uDst.ai16[7] = puSrc2->ai16[6] + puSrc2->ai16[7];
14089
14090 puDst->au64[0] = uDst.au64[0];
14091 puDst->au64[1] = uDst.au64[1];
14092}
14093
14094
14095IEM_DECL_IMPL_DEF(void, iemAImpl_vphaddw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14096{
14097 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
14098
14099 uDst.ai16[ 0] = puSrc1->ai16[ 0] + puSrc1->ai16[ 1];
14100 uDst.ai16[ 1] = puSrc1->ai16[ 2] + puSrc1->ai16[ 3];
14101 uDst.ai16[ 2] = puSrc1->ai16[ 4] + puSrc1->ai16[ 5];
14102 uDst.ai16[ 3] = puSrc1->ai16[ 6] + puSrc1->ai16[ 7];
14103 uDst.ai16[ 4] = puSrc2->ai16[ 0] + puSrc2->ai16[ 1];
14104 uDst.ai16[ 5] = puSrc2->ai16[ 2] + puSrc2->ai16[ 3];
14105 uDst.ai16[ 6] = puSrc2->ai16[ 4] + puSrc2->ai16[ 5];
14106 uDst.ai16[ 7] = puSrc2->ai16[ 6] + puSrc2->ai16[ 7];
14107
14108 uDst.ai16[ 8] = puSrc1->ai16[ 8] + puSrc1->ai16[ 9];
14109 uDst.ai16[ 9] = puSrc1->ai16[10] + puSrc1->ai16[11];
14110 uDst.ai16[10] = puSrc1->ai16[12] + puSrc1->ai16[13];
14111 uDst.ai16[11] = puSrc1->ai16[14] + puSrc1->ai16[15];
14112 uDst.ai16[12] = puSrc2->ai16[ 8] + puSrc2->ai16[ 9];
14113 uDst.ai16[13] = puSrc2->ai16[10] + puSrc2->ai16[11];
14114 uDst.ai16[14] = puSrc2->ai16[12] + puSrc2->ai16[13];
14115 uDst.ai16[15] = puSrc2->ai16[14] + puSrc2->ai16[15];
14116
14117 puDst->au64[0] = uDst.au64[0];
14118 puDst->au64[1] = uDst.au64[1];
14119 puDst->au64[2] = uDst.au64[2];
14120 puDst->au64[3] = uDst.au64[3];
14121}
14122
14123
14124IEM_DECL_IMPL_DEF(void, iemAImpl_vphaddd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14125{
14126 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
14127
14128 uDst.ai32[0] = puSrc1->ai32[0] + puSrc1->ai32[1];
14129 uDst.ai32[1] = puSrc1->ai32[2] + puSrc1->ai32[3];
14130
14131 uDst.ai32[2] = puSrc2->ai32[0] + puSrc2->ai32[1];
14132 uDst.ai32[3] = puSrc2->ai32[2] + puSrc2->ai32[3];
14133
14134 puDst->au64[0] = uDst.au64[0];
14135 puDst->au64[1] = uDst.au64[1];
14136}
14137
14138
14139IEM_DECL_IMPL_DEF(void, iemAImpl_vphaddd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14140{
14141 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
14142
14143 uDst.ai32[0] = puSrc1->ai32[ 0] + puSrc1->ai32[ 1];
14144 uDst.ai32[1] = puSrc1->ai32[ 2] + puSrc1->ai32[ 3];
14145 uDst.ai32[2] = puSrc2->ai32[ 0] + puSrc2->ai32[ 1];
14146 uDst.ai32[3] = puSrc2->ai32[ 2] + puSrc2->ai32[ 3];
14147
14148 uDst.ai32[4] = puSrc1->ai32[ 4] + puSrc1->ai32[ 5];
14149 uDst.ai32[5] = puSrc1->ai32[ 6] + puSrc1->ai32[ 7];
14150 uDst.ai32[6] = puSrc2->ai32[ 4] + puSrc2->ai32[ 5];
14151 uDst.ai32[7] = puSrc2->ai32[ 6] + puSrc2->ai32[ 7];
14152
14153 puDst->au64[0] = uDst.au64[0];
14154 puDst->au64[1] = uDst.au64[1];
14155 puDst->au64[2] = uDst.au64[2];
14156 puDst->au64[3] = uDst.au64[3];
14157}
14158
14159
14160/*
14161 * PHSUBW / VPHSUBW / PHSUBD / VPHSUBD
14162 */
14163IEM_DECL_IMPL_DEF(void, iemAImpl_phsubw_u64_fallback,(uint64_t *puDst, uint64_t const *puSrc))
14164{
14165 RTUINT64U uSrc1 = { *puDst };
14166 RTUINT64U uSrc2 = { *puSrc };
14167 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
14168
14169 uDst.ai16[0] = uSrc1.ai16[0] - uSrc1.ai16[1];
14170 uDst.ai16[1] = uSrc1.ai16[2] - uSrc1.ai16[3];
14171 uDst.ai16[2] = uSrc2.ai16[0] - uSrc2.ai16[1];
14172 uDst.ai16[3] = uSrc2.ai16[2] - uSrc2.ai16[3];
14173 *puDst = uDst.u;
14174}
14175
14176
14177IEM_DECL_IMPL_DEF(void, iemAImpl_phsubw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
14178{
14179 RTUINT128U uSrc1 = *puDst;
14180
14181 puDst->ai16[0] = uSrc1.ai16[0] - uSrc1.ai16[1];
14182 puDst->ai16[1] = uSrc1.ai16[2] - uSrc1.ai16[3];
14183 puDst->ai16[2] = uSrc1.ai16[4] - uSrc1.ai16[5];
14184 puDst->ai16[3] = uSrc1.ai16[6] - uSrc1.ai16[7];
14185
14186 puDst->ai16[4] = puSrc->ai16[0] - puSrc->ai16[1];
14187 puDst->ai16[5] = puSrc->ai16[2] - puSrc->ai16[3];
14188 puDst->ai16[6] = puSrc->ai16[4] - puSrc->ai16[5];
14189 puDst->ai16[7] = puSrc->ai16[6] - puSrc->ai16[7];
14190}
14191
14192
14193IEM_DECL_IMPL_DEF(void, iemAImpl_phsubd_u64_fallback,(uint64_t *puDst, uint64_t const *puSrc))
14194{
14195 RTUINT64U uSrc1 = { *puDst };
14196 RTUINT64U uSrc2 = { *puSrc };
14197 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
14198
14199 uDst.ai32[0] = uSrc1.ai32[0] - uSrc1.ai32[1];
14200 uDst.ai32[1] = uSrc2.ai32[0] - uSrc2.ai32[1];
14201 *puDst = uDst.u;
14202}
14203
14204
14205IEM_DECL_IMPL_DEF(void, iemAImpl_phsubd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
14206{
14207 RTUINT128U uSrc1 = *puDst;
14208
14209 puDst->ai32[0] = uSrc1.ai32[0] - uSrc1.ai32[1];
14210 puDst->ai32[1] = uSrc1.ai32[2] - uSrc1.ai32[3];
14211
14212 puDst->ai32[2] = puSrc->ai32[0] - puSrc->ai32[1];
14213 puDst->ai32[3] = puSrc->ai32[2] - puSrc->ai32[3];
14214}
14215
14216
14217IEM_DECL_IMPL_DEF(void, iemAImpl_vphsubw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14218{
14219 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
14220
14221 uDst.ai16[0] = puSrc1->ai16[0] - puSrc1->ai16[1];
14222 uDst.ai16[1] = puSrc1->ai16[2] - puSrc1->ai16[3];
14223 uDst.ai16[2] = puSrc1->ai16[4] - puSrc1->ai16[5];
14224 uDst.ai16[3] = puSrc1->ai16[6] - puSrc1->ai16[7];
14225
14226 uDst.ai16[4] = puSrc2->ai16[0] - puSrc2->ai16[1];
14227 uDst.ai16[5] = puSrc2->ai16[2] - puSrc2->ai16[3];
14228 uDst.ai16[6] = puSrc2->ai16[4] - puSrc2->ai16[5];
14229 uDst.ai16[7] = puSrc2->ai16[6] - puSrc2->ai16[7];
14230
14231 puDst->au64[0] = uDst.au64[0];
14232 puDst->au64[1] = uDst.au64[1];
14233}
14234
14235
14236IEM_DECL_IMPL_DEF(void, iemAImpl_vphsubw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14237{
14238 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
14239
14240 uDst.ai16[ 0] = puSrc1->ai16[ 0] - puSrc1->ai16[ 1];
14241 uDst.ai16[ 1] = puSrc1->ai16[ 2] - puSrc1->ai16[ 3];
14242 uDst.ai16[ 2] = puSrc1->ai16[ 4] - puSrc1->ai16[ 5];
14243 uDst.ai16[ 3] = puSrc1->ai16[ 6] - puSrc1->ai16[ 7];
14244 uDst.ai16[ 4] = puSrc2->ai16[ 0] - puSrc2->ai16[ 1];
14245 uDst.ai16[ 5] = puSrc2->ai16[ 2] - puSrc2->ai16[ 3];
14246 uDst.ai16[ 6] = puSrc2->ai16[ 4] - puSrc2->ai16[ 5];
14247 uDst.ai16[ 7] = puSrc2->ai16[ 6] - puSrc2->ai16[ 7];
14248
14249 uDst.ai16[ 8] = puSrc1->ai16[ 8] - puSrc1->ai16[ 9];
14250 uDst.ai16[ 9] = puSrc1->ai16[10] - puSrc1->ai16[11];
14251 uDst.ai16[10] = puSrc1->ai16[12] - puSrc1->ai16[13];
14252 uDst.ai16[11] = puSrc1->ai16[14] - puSrc1->ai16[15];
14253 uDst.ai16[12] = puSrc2->ai16[ 8] - puSrc2->ai16[ 9];
14254 uDst.ai16[13] = puSrc2->ai16[10] - puSrc2->ai16[11];
14255 uDst.ai16[14] = puSrc2->ai16[12] - puSrc2->ai16[13];
14256 uDst.ai16[15] = puSrc2->ai16[14] - puSrc2->ai16[15];
14257
14258 puDst->au64[0] = uDst.au64[0];
14259 puDst->au64[1] = uDst.au64[1];
14260 puDst->au64[2] = uDst.au64[2];
14261 puDst->au64[3] = uDst.au64[3];
14262}
14263
14264
14265IEM_DECL_IMPL_DEF(void, iemAImpl_vphsubd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14266{
14267 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
14268
14269 uDst.ai32[0] = puSrc1->ai32[0] - puSrc1->ai32[1];
14270 uDst.ai32[1] = puSrc1->ai32[2] - puSrc1->ai32[3];
14271
14272 uDst.ai32[2] = puSrc2->ai32[0] - puSrc2->ai32[1];
14273 uDst.ai32[3] = puSrc2->ai32[2] - puSrc2->ai32[3];
14274
14275 puDst->au64[0] = uDst.au64[0];
14276 puDst->au64[1] = uDst.au64[1];
14277}
14278
14279
14280IEM_DECL_IMPL_DEF(void, iemAImpl_vphsubd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14281{
14282 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
14283
14284 uDst.ai32[0] = puSrc1->ai32[ 0] - puSrc1->ai32[ 1];
14285 uDst.ai32[1] = puSrc1->ai32[ 2] - puSrc1->ai32[ 3];
14286 uDst.ai32[2] = puSrc2->ai32[ 0] - puSrc2->ai32[ 1];
14287 uDst.ai32[3] = puSrc2->ai32[ 2] - puSrc2->ai32[ 3];
14288
14289 uDst.ai32[4] = puSrc1->ai32[ 4] - puSrc1->ai32[ 5];
14290 uDst.ai32[5] = puSrc1->ai32[ 6] - puSrc1->ai32[ 7];
14291 uDst.ai32[6] = puSrc2->ai32[ 4] - puSrc2->ai32[ 5];
14292 uDst.ai32[7] = puSrc2->ai32[ 6] - puSrc2->ai32[ 7];
14293
14294 puDst->au64[0] = uDst.au64[0];
14295 puDst->au64[1] = uDst.au64[1];
14296 puDst->au64[2] = uDst.au64[2];
14297 puDst->au64[3] = uDst.au64[3];
14298}
14299
14300
14301/*
14302 * PHADDSW / VPHADDSW
14303 */
14304IEM_DECL_IMPL_DEF(void, iemAImpl_phaddsw_u64_fallback,(uint64_t *puDst, uint64_t const *puSrc))
14305{
14306 RTUINT64U uSrc1 = { *puDst };
14307 RTUINT64U uSrc2 = { *puSrc };
14308 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
14309
14310 uDst.ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] + uSrc1.ai16[1]);
14311 uDst.ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] + uSrc1.ai16[3]);
14312 uDst.ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.ai16[0] + uSrc2.ai16[1]);
14313 uDst.ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.ai16[2] + uSrc2.ai16[3]);
14314 *puDst = uDst.u;
14315}
14316
14317
14318IEM_DECL_IMPL_DEF(void, iemAImpl_phaddsw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
14319{
14320 RTUINT128U uSrc1 = *puDst;
14321
14322 puDst->ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] + uSrc1.ai16[1]);
14323 puDst->ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] + uSrc1.ai16[3]);
14324 puDst->ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[4] + uSrc1.ai16[5]);
14325 puDst->ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[6] + uSrc1.ai16[7]);
14326
14327 puDst->ai16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[0] + puSrc->ai16[1]);
14328 puDst->ai16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[2] + puSrc->ai16[3]);
14329 puDst->ai16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[4] + puSrc->ai16[5]);
14330 puDst->ai16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[6] + puSrc->ai16[7]);
14331}
14332
14333
14334IEM_DECL_IMPL_DEF(void, iemAImpl_vphaddsw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14335{
14336 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
14337
14338 uDst.ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[0] + puSrc1->ai16[1]);
14339 uDst.ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[2] + puSrc1->ai16[3]);
14340 uDst.ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[4] + puSrc1->ai16[5]);
14341 uDst.ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[6] + puSrc1->ai16[7]);
14342
14343 uDst.ai16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[0] + puSrc2->ai16[1]);
14344 uDst.ai16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[2] + puSrc2->ai16[3]);
14345 uDst.ai16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[4] + puSrc2->ai16[5]);
14346 uDst.ai16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[6] + puSrc2->ai16[7]);
14347
14348 puDst->au64[0] = uDst.au64[0];
14349 puDst->au64[1] = uDst.au64[1];
14350}
14351
14352
14353IEM_DECL_IMPL_DEF(void, iemAImpl_vphaddsw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14354{
14355 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
14356
14357 uDst.ai16[ 0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 0] + puSrc1->ai16[ 1]);
14358 uDst.ai16[ 1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 2] + puSrc1->ai16[ 3]);
14359 uDst.ai16[ 2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 4] + puSrc1->ai16[ 5]);
14360 uDst.ai16[ 3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 6] + puSrc1->ai16[ 7]);
14361 uDst.ai16[ 4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 0] + puSrc2->ai16[ 1]);
14362 uDst.ai16[ 5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 2] + puSrc2->ai16[ 3]);
14363 uDst.ai16[ 6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 4] + puSrc2->ai16[ 5]);
14364 uDst.ai16[ 7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 6] + puSrc2->ai16[ 7]);
14365
14366 uDst.ai16[ 8] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 8] + puSrc1->ai16[ 9]);
14367 uDst.ai16[ 9] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[10] + puSrc1->ai16[11]);
14368 uDst.ai16[10] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[12] + puSrc1->ai16[13]);
14369 uDst.ai16[11] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[14] + puSrc1->ai16[15]);
14370 uDst.ai16[12] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 8] + puSrc2->ai16[ 9]);
14371 uDst.ai16[13] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[10] + puSrc2->ai16[11]);
14372 uDst.ai16[14] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[12] + puSrc2->ai16[13]);
14373 uDst.ai16[15] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[14] + puSrc2->ai16[15]);
14374
14375 puDst->au64[0] = uDst.au64[0];
14376 puDst->au64[1] = uDst.au64[1];
14377 puDst->au64[2] = uDst.au64[2];
14378 puDst->au64[3] = uDst.au64[3];
14379}
14380
14381
14382/*
14383 * PHSUBSW / VPHSUBSW
14384 */
14385IEM_DECL_IMPL_DEF(void, iemAImpl_phsubsw_u64_fallback,(uint64_t *puDst, uint64_t const *puSrc))
14386{
14387 RTUINT64U uSrc1 = { *puDst };
14388 RTUINT64U uSrc2 = { *puSrc };
14389 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
14390
14391 uDst.ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] - uSrc1.ai16[1]);
14392 uDst.ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] - uSrc1.ai16[3]);
14393 uDst.ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.ai16[0] - uSrc2.ai16[1]);
14394 uDst.ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.ai16[2] - uSrc2.ai16[3]);
14395 *puDst = uDst.u;
14396}
14397
14398
14399IEM_DECL_IMPL_DEF(void, iemAImpl_phsubsw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
14400{
14401 RTUINT128U uSrc1 = *puDst;
14402
14403 puDst->ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] - uSrc1.ai16[1]);
14404 puDst->ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] - uSrc1.ai16[3]);
14405 puDst->ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[4] - uSrc1.ai16[5]);
14406 puDst->ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[6] - uSrc1.ai16[7]);
14407
14408 puDst->ai16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[0] - puSrc->ai16[1]);
14409 puDst->ai16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[2] - puSrc->ai16[3]);
14410 puDst->ai16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[4] - puSrc->ai16[5]);
14411 puDst->ai16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[6] - puSrc->ai16[7]);
14412}
14413
14414
14415IEM_DECL_IMPL_DEF(void, iemAImpl_vphsubsw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14416{
14417 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
14418
14419 uDst.ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[0] - puSrc1->ai16[1]);
14420 uDst.ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[2] - puSrc1->ai16[3]);
14421 uDst.ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[4] - puSrc1->ai16[5]);
14422 uDst.ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[6] - puSrc1->ai16[7]);
14423
14424 uDst.ai16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[0] - puSrc2->ai16[1]);
14425 uDst.ai16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[2] - puSrc2->ai16[3]);
14426 uDst.ai16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[4] - puSrc2->ai16[5]);
14427 uDst.ai16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[6] - puSrc2->ai16[7]);
14428
14429 puDst->au64[0] = uDst.au64[0];
14430 puDst->au64[1] = uDst.au64[1];
14431}
14432
14433
14434IEM_DECL_IMPL_DEF(void, iemAImpl_vphsubsw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14435{
14436 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
14437
14438 uDst.ai16[ 0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 0] - puSrc1->ai16[ 1]);
14439 uDst.ai16[ 1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 2] - puSrc1->ai16[ 3]);
14440 uDst.ai16[ 2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 4] - puSrc1->ai16[ 5]);
14441 uDst.ai16[ 3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 6] - puSrc1->ai16[ 7]);
14442 uDst.ai16[ 4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 0] - puSrc2->ai16[ 1]);
14443 uDst.ai16[ 5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 2] - puSrc2->ai16[ 3]);
14444 uDst.ai16[ 6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 4] - puSrc2->ai16[ 5]);
14445 uDst.ai16[ 7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 6] - puSrc2->ai16[ 7]);
14446
14447 uDst.ai16[ 8] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 8] - puSrc1->ai16[ 9]);
14448 uDst.ai16[ 9] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[10] - puSrc1->ai16[11]);
14449 uDst.ai16[10] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[12] - puSrc1->ai16[13]);
14450 uDst.ai16[11] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[14] - puSrc1->ai16[15]);
14451 uDst.ai16[12] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 8] - puSrc2->ai16[ 9]);
14452 uDst.ai16[13] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[10] - puSrc2->ai16[11]);
14453 uDst.ai16[14] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[12] - puSrc2->ai16[13]);
14454 uDst.ai16[15] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[14] - puSrc2->ai16[15]);
14455
14456 puDst->au64[0] = uDst.au64[0];
14457 puDst->au64[1] = uDst.au64[1];
14458 puDst->au64[2] = uDst.au64[2];
14459 puDst->au64[3] = uDst.au64[3];
14460}
14461
14462
14463/*
14464 * PMADDUBSW / VPMADDUBSW
14465 */
14466IEM_DECL_IMPL_DEF(void, iemAImpl_pmaddubsw_u64_fallback,(uint64_t *puDst, uint64_t const *puSrc))
14467{
14468 RTUINT64U uSrc1 = { *puDst };
14469 RTUINT64U uSrc2 = { *puSrc };
14470 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
14471
14472 uDst.ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[0] * uSrc2.ai8[0] + (uint16_t)uSrc1.au8[1] * uSrc2.ai8[1]);
14473 uDst.ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[2] * uSrc2.ai8[2] + (uint16_t)uSrc1.au8[3] * uSrc2.ai8[3]);
14474 uDst.ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[4] * uSrc2.ai8[4] + (uint16_t)uSrc1.au8[5] * uSrc2.ai8[5]);
14475 uDst.ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[6] * uSrc2.ai8[6] + (uint16_t)uSrc1.au8[7] * uSrc2.ai8[7]);
14476 *puDst = uDst.u;
14477}
14478
14479
14480IEM_DECL_IMPL_DEF(void, iemAImpl_pmaddubsw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
14481{
14482 RTUINT128U uSrc1 = *puDst;
14483
14484 puDst->ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[ 0] * puSrc->ai8[ 0] + (uint16_t)uSrc1.au8[ 1] * puSrc->ai8[ 1]);
14485 puDst->ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[ 2] * puSrc->ai8[ 2] + (uint16_t)uSrc1.au8[ 3] * puSrc->ai8[ 3]);
14486 puDst->ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[ 4] * puSrc->ai8[ 4] + (uint16_t)uSrc1.au8[ 5] * puSrc->ai8[ 5]);
14487 puDst->ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[ 6] * puSrc->ai8[ 6] + (uint16_t)uSrc1.au8[ 7] * puSrc->ai8[ 7]);
14488 puDst->ai16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[ 8] * puSrc->ai8[ 8] + (uint16_t)uSrc1.au8[ 9] * puSrc->ai8[ 9]);
14489 puDst->ai16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[10] * puSrc->ai8[10] + (uint16_t)uSrc1.au8[11] * puSrc->ai8[11]);
14490 puDst->ai16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[12] * puSrc->ai8[12] + (uint16_t)uSrc1.au8[13] * puSrc->ai8[13]);
14491 puDst->ai16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[14] * puSrc->ai8[14] + (uint16_t)uSrc1.au8[15] * puSrc->ai8[15]);
14492}
14493
14494
14495IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaddubsw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14496{
14497 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
14498
14499 uDst.ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 0] * puSrc2->ai8[ 0] + (uint16_t)puSrc1->au8[ 1] * puSrc2->ai8[ 1]);
14500 uDst.ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 2] * puSrc2->ai8[ 2] + (uint16_t)puSrc1->au8[ 3] * puSrc2->ai8[ 3]);
14501 uDst.ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 4] * puSrc2->ai8[ 4] + (uint16_t)puSrc1->au8[ 5] * puSrc2->ai8[ 5]);
14502 uDst.ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 6] * puSrc2->ai8[ 6] + (uint16_t)puSrc1->au8[ 7] * puSrc2->ai8[ 7]);
14503 uDst.ai16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 8] * puSrc2->ai8[ 8] + (uint16_t)puSrc1->au8[ 9] * puSrc2->ai8[ 9]);
14504 uDst.ai16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[10] * puSrc2->ai8[10] + (uint16_t)puSrc1->au8[11] * puSrc2->ai8[11]);
14505 uDst.ai16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[12] * puSrc2->ai8[12] + (uint16_t)puSrc1->au8[13] * puSrc2->ai8[13]);
14506 uDst.ai16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[14] * puSrc2->ai8[14] + (uint16_t)puSrc1->au8[15] * puSrc2->ai8[15]);
14507
14508 puDst->au64[0] = uDst.au64[0];
14509 puDst->au64[1] = uDst.au64[1];
14510}
14511
14512
14513IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaddubsw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14514{
14515 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
14516
14517 uDst.ai16[ 0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 0] * puSrc2->ai8[ 0] + (uint16_t)puSrc1->au8[ 1] * puSrc2->ai8[ 1]);
14518 uDst.ai16[ 1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 2] * puSrc2->ai8[ 2] + (uint16_t)puSrc1->au8[ 3] * puSrc2->ai8[ 3]);
14519 uDst.ai16[ 2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 4] * puSrc2->ai8[ 4] + (uint16_t)puSrc1->au8[ 5] * puSrc2->ai8[ 5]);
14520 uDst.ai16[ 3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 6] * puSrc2->ai8[ 6] + (uint16_t)puSrc1->au8[ 7] * puSrc2->ai8[ 7]);
14521 uDst.ai16[ 4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 8] * puSrc2->ai8[ 8] + (uint16_t)puSrc1->au8[ 9] * puSrc2->ai8[ 9]);
14522 uDst.ai16[ 5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[10] * puSrc2->ai8[10] + (uint16_t)puSrc1->au8[11] * puSrc2->ai8[11]);
14523 uDst.ai16[ 6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[12] * puSrc2->ai8[12] + (uint16_t)puSrc1->au8[13] * puSrc2->ai8[13]);
14524 uDst.ai16[ 7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[14] * puSrc2->ai8[14] + (uint16_t)puSrc1->au8[15] * puSrc2->ai8[15]);
14525 uDst.ai16[ 8] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[16] * puSrc2->ai8[16] + (uint16_t)puSrc1->au8[17] * puSrc2->ai8[17]);
14526 uDst.ai16[ 9] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[18] * puSrc2->ai8[18] + (uint16_t)puSrc1->au8[19] * puSrc2->ai8[19]);
14527 uDst.ai16[10] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[20] * puSrc2->ai8[20] + (uint16_t)puSrc1->au8[21] * puSrc2->ai8[21]);
14528 uDst.ai16[11] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[22] * puSrc2->ai8[22] + (uint16_t)puSrc1->au8[23] * puSrc2->ai8[23]);
14529 uDst.ai16[12] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[24] * puSrc2->ai8[24] + (uint16_t)puSrc1->au8[25] * puSrc2->ai8[25]);
14530 uDst.ai16[13] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[26] * puSrc2->ai8[26] + (uint16_t)puSrc1->au8[27] * puSrc2->ai8[27]);
14531 uDst.ai16[14] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[28] * puSrc2->ai8[28] + (uint16_t)puSrc1->au8[29] * puSrc2->ai8[29]);
14532 uDst.ai16[15] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[30] * puSrc2->ai8[30] + (uint16_t)puSrc1->au8[31] * puSrc2->ai8[31]);
14533
14534 puDst->au64[0] = uDst.au64[0];
14535 puDst->au64[1] = uDst.au64[1];
14536 puDst->au64[2] = uDst.au64[2];
14537 puDst->au64[3] = uDst.au64[3];
14538}
14539
14540
14541/*
14542 * PMULHRSW / VPMULHRSW
14543 */
/** Computes one PMULHRSW result word: the product of the two operands (the
 *  first one cast to int32_t) is shifted right by 14, incremented by one,
 *  shifted right by one more bit and then truncated to uint16_t.
 *  Each argument is evaluated exactly once. */
#define DO_PMULHRSW(a_Src1, a_Src2) \
    (uint16_t)(((((int32_t)(a_Src1) * (a_Src2)) >> 14 ) + 1) >> 1)
14546
14547IEM_DECL_IMPL_DEF(void, iemAImpl_pmulhrsw_u64_fallback,(uint64_t *puDst, uint64_t const *puSrc))
14548{
14549 RTUINT64U uSrc1 = { *puDst };
14550 RTUINT64U uSrc2 = { *puSrc };
14551 RTUINT64U uDst;
14552
14553 uDst.au16[0] = DO_PMULHRSW(uSrc1.ai16[0], uSrc2.ai16[0]);
14554 uDst.au16[1] = DO_PMULHRSW(uSrc1.ai16[1], uSrc2.ai16[1]);
14555 uDst.au16[2] = DO_PMULHRSW(uSrc1.ai16[2], uSrc2.ai16[2]);
14556 uDst.au16[3] = DO_PMULHRSW(uSrc1.ai16[3], uSrc2.ai16[3]);
14557 *puDst = uDst.u;
14558}
14559
14560
14561IEM_DECL_IMPL_DEF(void, iemAImpl_pmulhrsw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
14562{
14563 RTUINT128U uSrc1 = *puDst;
14564
14565 puDst->ai16[0] = DO_PMULHRSW(uSrc1.ai16[0], puSrc->ai16[0]);
14566 puDst->ai16[1] = DO_PMULHRSW(uSrc1.ai16[1], puSrc->ai16[1]);
14567 puDst->ai16[2] = DO_PMULHRSW(uSrc1.ai16[2], puSrc->ai16[2]);
14568 puDst->ai16[3] = DO_PMULHRSW(uSrc1.ai16[3], puSrc->ai16[3]);
14569 puDst->ai16[4] = DO_PMULHRSW(uSrc1.ai16[4], puSrc->ai16[4]);
14570 puDst->ai16[5] = DO_PMULHRSW(uSrc1.ai16[5], puSrc->ai16[5]);
14571 puDst->ai16[6] = DO_PMULHRSW(uSrc1.ai16[6], puSrc->ai16[6]);
14572 puDst->ai16[7] = DO_PMULHRSW(uSrc1.ai16[7], puSrc->ai16[7]);
14573}
14574
14575
14576IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulhrsw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14577{
14578 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
14579
14580 uDst.ai16[0] = DO_PMULHRSW(puSrc1->ai16[0], puSrc2->ai16[0]);
14581 uDst.ai16[1] = DO_PMULHRSW(puSrc1->ai16[1], puSrc2->ai16[1]);
14582 uDst.ai16[2] = DO_PMULHRSW(puSrc1->ai16[2], puSrc2->ai16[2]);
14583 uDst.ai16[3] = DO_PMULHRSW(puSrc1->ai16[3], puSrc2->ai16[3]);
14584 uDst.ai16[4] = DO_PMULHRSW(puSrc1->ai16[4], puSrc2->ai16[4]);
14585 uDst.ai16[5] = DO_PMULHRSW(puSrc1->ai16[5], puSrc2->ai16[5]);
14586 uDst.ai16[6] = DO_PMULHRSW(puSrc1->ai16[6], puSrc2->ai16[6]);
14587 uDst.ai16[7] = DO_PMULHRSW(puSrc1->ai16[7], puSrc2->ai16[7]);
14588
14589 puDst->au64[0] = uDst.au64[0];
14590 puDst->au64[1] = uDst.au64[1];
14591}
14592
14593
14594IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulhrsw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14595{
14596 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
14597
14598 uDst.ai16[ 0] = DO_PMULHRSW(puSrc1->ai16[ 0], puSrc2->ai16[ 0]);
14599 uDst.ai16[ 1] = DO_PMULHRSW(puSrc1->ai16[ 1], puSrc2->ai16[ 1]);
14600 uDst.ai16[ 2] = DO_PMULHRSW(puSrc1->ai16[ 2], puSrc2->ai16[ 2]);
14601 uDst.ai16[ 3] = DO_PMULHRSW(puSrc1->ai16[ 3], puSrc2->ai16[ 3]);
14602 uDst.ai16[ 4] = DO_PMULHRSW(puSrc1->ai16[ 4], puSrc2->ai16[ 4]);
14603 uDst.ai16[ 5] = DO_PMULHRSW(puSrc1->ai16[ 5], puSrc2->ai16[ 5]);
14604 uDst.ai16[ 6] = DO_PMULHRSW(puSrc1->ai16[ 6], puSrc2->ai16[ 6]);
14605 uDst.ai16[ 7] = DO_PMULHRSW(puSrc1->ai16[ 7], puSrc2->ai16[ 7]);
14606 uDst.ai16[ 8] = DO_PMULHRSW(puSrc1->ai16[ 8], puSrc2->ai16[ 8]);
14607 uDst.ai16[ 9] = DO_PMULHRSW(puSrc1->ai16[ 9], puSrc2->ai16[ 9]);
14608 uDst.ai16[10] = DO_PMULHRSW(puSrc1->ai16[10], puSrc2->ai16[10]);
14609 uDst.ai16[11] = DO_PMULHRSW(puSrc1->ai16[11], puSrc2->ai16[11]);
14610 uDst.ai16[12] = DO_PMULHRSW(puSrc1->ai16[12], puSrc2->ai16[12]);
14611 uDst.ai16[13] = DO_PMULHRSW(puSrc1->ai16[13], puSrc2->ai16[13]);
14612 uDst.ai16[14] = DO_PMULHRSW(puSrc1->ai16[14], puSrc2->ai16[14]);
14613 uDst.ai16[15] = DO_PMULHRSW(puSrc1->ai16[15], puSrc2->ai16[15]);
14614
14615 puDst->au64[0] = uDst.au64[0];
14616 puDst->au64[1] = uDst.au64[1];
14617 puDst->au64[2] = uDst.au64[2];
14618 puDst->au64[3] = uDst.au64[3];
14619}
14620
14621
14622/*
14623 * PSADBW / VPSADBW
14624 */
14625#ifdef IEM_WITHOUT_ASSEMBLY
14626
14627IEM_DECL_IMPL_DEF(void, iemAImpl_psadbw_u64,(uint64_t *puDst, uint64_t const *puSrc))
14628{
14629 RTUINT64U uSrc1 = { *puDst };
14630 RTUINT64U uSrc2 = { *puSrc };
14631 RTUINT64U uDst;
14632 uint16_t uSum = RT_ABS((int16_t)uSrc1.au8[0] - uSrc2.au8[0]);
14633 uSum += RT_ABS((int16_t)uSrc1.au8[1] - uSrc2.au8[1]);
14634 uSum += RT_ABS((int16_t)uSrc1.au8[2] - uSrc2.au8[2]);
14635 uSum += RT_ABS((int16_t)uSrc1.au8[3] - uSrc2.au8[3]);
14636 uSum += RT_ABS((int16_t)uSrc1.au8[4] - uSrc2.au8[4]);
14637 uSum += RT_ABS((int16_t)uSrc1.au8[5] - uSrc2.au8[5]);
14638 uSum += RT_ABS((int16_t)uSrc1.au8[6] - uSrc2.au8[6]);
14639 uSum += RT_ABS((int16_t)uSrc1.au8[7] - uSrc2.au8[7]);
14640
14641 uDst.au64[0] = 0;
14642 uDst.au16[0] = uSum;
14643 *puDst = uDst.u;
14644}
14645
14646
14647IEM_DECL_IMPL_DEF(void, iemAImpl_psadbw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
14648{
14649 RTUINT128U uSrc1 = *puDst;
14650
14651 puDst->au64[0] = 0;
14652 puDst->au64[1] = 0;
14653
14654 uint16_t uSum = RT_ABS((int16_t)uSrc1.ai8[0] - puSrc->ai8[0]);
14655 uSum += RT_ABS((int16_t)uSrc1.au8[1] - puSrc->au8[1]);
14656 uSum += RT_ABS((int16_t)uSrc1.au8[2] - puSrc->au8[2]);
14657 uSum += RT_ABS((int16_t)uSrc1.au8[3] - puSrc->au8[3]);
14658 uSum += RT_ABS((int16_t)uSrc1.au8[4] - puSrc->au8[4]);
14659 uSum += RT_ABS((int16_t)uSrc1.au8[5] - puSrc->au8[5]);
14660 uSum += RT_ABS((int16_t)uSrc1.au8[6] - puSrc->au8[6]);
14661 uSum += RT_ABS((int16_t)uSrc1.au8[7] - puSrc->au8[7]);
14662 puDst->au16[0] = uSum;
14663
14664 uSum = RT_ABS((int16_t)uSrc1.au8[ 8] - puSrc->au8[ 8]);
14665 uSum += RT_ABS((int16_t)uSrc1.au8[ 9] - puSrc->au8[ 9]);
14666 uSum += RT_ABS((int16_t)uSrc1.au8[10] - puSrc->au8[10]);
14667 uSum += RT_ABS((int16_t)uSrc1.au8[11] - puSrc->au8[11]);
14668 uSum += RT_ABS((int16_t)uSrc1.au8[12] - puSrc->au8[12]);
14669 uSum += RT_ABS((int16_t)uSrc1.au8[13] - puSrc->au8[13]);
14670 uSum += RT_ABS((int16_t)uSrc1.au8[14] - puSrc->au8[14]);
14671 uSum += RT_ABS((int16_t)uSrc1.au8[15] - puSrc->au8[15]);
14672 puDst->au16[4] = uSum;
14673}
14674
14675#endif
14676
14677IEM_DECL_IMPL_DEF(void, iemAImpl_vpsadbw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14678{
14679 RTUINT128U uSrc1 = *puSrc1;
14680 RTUINT128U uSrc2 = *puSrc2;
14681
14682 puDst->au64[0] = 0;
14683 puDst->au64[1] = 0;
14684
14685 uint16_t uSum = RT_ABS((int16_t)uSrc1.ai8[0] - uSrc2.ai8[0]);
14686 uSum += RT_ABS((int16_t)uSrc1.au8[1] - uSrc2.au8[1]);
14687 uSum += RT_ABS((int16_t)uSrc1.au8[2] - uSrc2.au8[2]);
14688 uSum += RT_ABS((int16_t)uSrc1.au8[3] - uSrc2.au8[3]);
14689 uSum += RT_ABS((int16_t)uSrc1.au8[4] - uSrc2.au8[4]);
14690 uSum += RT_ABS((int16_t)uSrc1.au8[5] - uSrc2.au8[5]);
14691 uSum += RT_ABS((int16_t)uSrc1.au8[6] - uSrc2.au8[6]);
14692 uSum += RT_ABS((int16_t)uSrc1.au8[7] - uSrc2.au8[7]);
14693 puDst->au16[0] = uSum;
14694
14695 uSum = RT_ABS((int16_t)uSrc1.au8[ 8] - uSrc2.au8[ 8]);
14696 uSum += RT_ABS((int16_t)uSrc1.au8[ 9] - uSrc2.au8[ 9]);
14697 uSum += RT_ABS((int16_t)uSrc1.au8[10] - uSrc2.au8[10]);
14698 uSum += RT_ABS((int16_t)uSrc1.au8[11] - uSrc2.au8[11]);
14699 uSum += RT_ABS((int16_t)uSrc1.au8[12] - uSrc2.au8[12]);
14700 uSum += RT_ABS((int16_t)uSrc1.au8[13] - uSrc2.au8[13]);
14701 uSum += RT_ABS((int16_t)uSrc1.au8[14] - uSrc2.au8[14]);
14702 uSum += RT_ABS((int16_t)uSrc1.au8[15] - uSrc2.au8[15]);
14703 puDst->au16[4] = uSum;
14704}
14705
14706IEM_DECL_IMPL_DEF(void, iemAImpl_vpsadbw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14707{
14708 RTUINT256U uSrc1 = *puSrc1;
14709 RTUINT256U uSrc2 = *puSrc2;
14710
14711 puDst->au64[0] = 0;
14712 puDst->au64[1] = 0;
14713 puDst->au64[2] = 0;
14714 puDst->au64[3] = 0;
14715
14716 uint16_t uSum = RT_ABS((int16_t)uSrc1.au8[0] - uSrc2.au8[0]);
14717 uSum += RT_ABS((int16_t)uSrc1.au8[1] - uSrc2.au8[1]);
14718 uSum += RT_ABS((int16_t)uSrc1.au8[2] - uSrc2.au8[2]);
14719 uSum += RT_ABS((int16_t)uSrc1.au8[3] - uSrc2.au8[3]);
14720 uSum += RT_ABS((int16_t)uSrc1.au8[4] - uSrc2.au8[4]);
14721 uSum += RT_ABS((int16_t)uSrc1.au8[5] - uSrc2.au8[5]);
14722 uSum += RT_ABS((int16_t)uSrc1.au8[6] - uSrc2.au8[6]);
14723 uSum += RT_ABS((int16_t)uSrc1.au8[7] - uSrc2.au8[7]);
14724 puDst->au16[0] = uSum;
14725
14726 uSum = RT_ABS((int16_t)uSrc1.au8[ 8] - uSrc2.au8[ 8]);
14727 uSum += RT_ABS((int16_t)uSrc1.au8[ 9] - uSrc2.au8[ 9]);
14728 uSum += RT_ABS((int16_t)uSrc1.au8[10] - uSrc2.au8[10]);
14729 uSum += RT_ABS((int16_t)uSrc1.au8[11] - uSrc2.au8[11]);
14730 uSum += RT_ABS((int16_t)uSrc1.au8[12] - uSrc2.au8[12]);
14731 uSum += RT_ABS((int16_t)uSrc1.au8[13] - uSrc2.au8[13]);
14732 uSum += RT_ABS((int16_t)uSrc1.au8[14] - uSrc2.au8[14]);
14733 uSum += RT_ABS((int16_t)uSrc1.au8[15] - uSrc2.au8[15]);
14734 puDst->au16[4] = uSum;
14735
14736 uSum = RT_ABS((int16_t)uSrc1.au8[16] - uSrc2.au8[16]);
14737 uSum += RT_ABS((int16_t)uSrc1.au8[17] - uSrc2.au8[17]);
14738 uSum += RT_ABS((int16_t)uSrc1.au8[18] - uSrc2.au8[18]);
14739 uSum += RT_ABS((int16_t)uSrc1.au8[19] - uSrc2.au8[19]);
14740 uSum += RT_ABS((int16_t)uSrc1.au8[20] - uSrc2.au8[20]);
14741 uSum += RT_ABS((int16_t)uSrc1.au8[21] - uSrc2.au8[21]);
14742 uSum += RT_ABS((int16_t)uSrc1.au8[22] - uSrc2.au8[22]);
14743 uSum += RT_ABS((int16_t)uSrc1.au8[23] - uSrc2.au8[23]);
14744 puDst->au16[8] = uSum;
14745
14746 uSum = RT_ABS((int16_t)uSrc1.au8[24] - uSrc2.au8[24]);
14747 uSum += RT_ABS((int16_t)uSrc1.au8[25] - uSrc2.au8[25]);
14748 uSum += RT_ABS((int16_t)uSrc1.au8[26] - uSrc2.au8[26]);
14749 uSum += RT_ABS((int16_t)uSrc1.au8[27] - uSrc2.au8[27]);
14750 uSum += RT_ABS((int16_t)uSrc1.au8[28] - uSrc2.au8[28]);
14751 uSum += RT_ABS((int16_t)uSrc1.au8[29] - uSrc2.au8[29]);
14752 uSum += RT_ABS((int16_t)uSrc1.au8[30] - uSrc2.au8[30]);
14753 uSum += RT_ABS((int16_t)uSrc1.au8[31] - uSrc2.au8[31]);
14754 puDst->au16[12] = uSum;
14755}
14756
14757
14758/*
14759 * PMULDQ / VPMULDQ
14760 */
14761IEM_DECL_IMPL_DEF(void, iemAImpl_pmuldq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
14762{
14763 RTUINT128U uSrc1 = *puDst;
14764
14765 puDst->au64[0] = (int64_t)uSrc1.ai32[0] * puSrc->ai32[0];
14766 puDst->au64[1] = (int64_t)uSrc1.ai32[2] * puSrc->ai32[2];
14767}
14768
14769IEM_DECL_IMPL_DEF(void, iemAImpl_vpmuldq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14770{
14771 RTUINT128U uSrc1 = *puSrc1;
14772 RTUINT128U uSrc2 = *puSrc2;
14773
14774 puDst->au64[0] = (int64_t)uSrc1.ai32[0] * uSrc2.ai32[0];
14775 puDst->au64[1] = (int64_t)uSrc1.ai32[2] * uSrc2.ai32[2];
14776}
14777
14778IEM_DECL_IMPL_DEF(void, iemAImpl_vpmuldq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14779{
14780 RTUINT256U uSrc1 = *puSrc1;
14781 RTUINT256U uSrc2 = *puSrc2;
14782
14783 puDst->au64[0] = (int64_t)uSrc1.ai32[0] * uSrc2.ai32[0];
14784 puDst->au64[1] = (int64_t)uSrc1.ai32[2] * uSrc2.ai32[2];
14785 puDst->au64[2] = (int64_t)uSrc1.ai32[4] * uSrc2.ai32[4];
14786 puDst->au64[3] = (int64_t)uSrc1.ai32[6] * uSrc2.ai32[6];
14787}
14788
14789
14790/*
14791 * PMULUDQ / VPMULUDQ
14792 */
14793#ifdef IEM_WITHOUT_ASSEMBLY
14794
14795IEM_DECL_IMPL_DEF(void, iemAImpl_pmuludq_u64,(uint64_t *puDst, uint64_t const *puSrc))
14796{
14797 RTUINT64U uSrc1 = { *puDst };
14798 RTUINT64U uSrc2 = { *puSrc };
14799 ASMCompilerBarrier();
14800 *puDst = (uint64_t)uSrc1.au32[0] * uSrc2.au32[0];
14801}
14802
14803
14804IEM_DECL_IMPL_DEF(void, iemAImpl_pmuludq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
14805{
14806 RTUINT128U uSrc1 = *puDst;
14807 RTUINT128U uSrc2 = *puSrc;
14808 ASMCompilerBarrier();
14809 puDst->au64[0] = (uint64_t)uSrc1.au32[0] * uSrc2.au32[0];
14810 puDst->au64[1] = (uint64_t)uSrc1.au32[2] * uSrc2.au32[2];
14811}
14812
14813#endif
14814
14815IEM_DECL_IMPL_DEF(void, iemAImpl_vpmuludq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14816{
14817 RTUINT128U uSrc1 = *puSrc1; /* Could overlap with puDst */
14818 RTUINT128U uSrc2 = *puSrc2; /* Could overlap with puDst */
14819 ASMCompilerBarrier();
14820 puDst->au64[0] = (uint64_t)uSrc1.au32[0] * uSrc2.au32[0];
14821 puDst->au64[1] = (uint64_t)uSrc1.au32[2] * uSrc2.au32[2];
14822}
14823
14824
14825IEM_DECL_IMPL_DEF(void, iemAImpl_vpmuludq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14826{
14827 RTUINT256U uSrc1 = *puSrc1; /* Could overlap with puDst */
14828 RTUINT256U uSrc2 = *puSrc2; /* Could overlap with puDst */
14829 ASMCompilerBarrier();
14830 puDst->au64[0] = (uint64_t)uSrc1.au32[0] * uSrc2.au32[0];
14831 puDst->au64[1] = (uint64_t)uSrc1.au32[2] * uSrc2.au32[2];
14832 puDst->au64[2] = (uint64_t)uSrc1.au32[4] * uSrc2.au32[4];
14833 puDst->au64[3] = (uint64_t)uSrc1.au32[6] * uSrc2.au32[6];
14834}
14835
14836
14837/*
14838 * UNPCKLPS / VUNPCKLPS
14839 */
14840#ifdef IEM_WITHOUT_ASSEMBLY
14841IEM_DECL_IMPL_DEF(void, iemAImpl_unpcklps_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
14842{
14843 RTUINT128U uSrc1 = *puDst;
14844 RTUINT128U uSrc2 = *puSrc;
14845 ASMCompilerBarrier();
14846 puDst->au32[0] = uSrc1.au32[0];
14847 puDst->au32[1] = uSrc2.au32[0];
14848 puDst->au32[2] = uSrc1.au32[1];
14849 puDst->au32[3] = uSrc2.au32[1];
14850}
14851
14852#endif
14853
14854IEM_DECL_IMPL_DEF(void, iemAImpl_vunpcklps_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14855{
14856 RTUINT128U uSrc1 = *puSrc1; /* Could overlap with puDst */
14857 RTUINT128U uSrc2 = *puSrc2; /* Could overlap with puDst */
14858 ASMCompilerBarrier();
14859 puDst->au32[0] = uSrc1.au32[0];
14860 puDst->au32[1] = uSrc2.au32[0];
14861 puDst->au32[2] = uSrc1.au32[1];
14862 puDst->au32[3] = uSrc2.au32[1];
14863}
14864
14865
14866IEM_DECL_IMPL_DEF(void, iemAImpl_vunpcklps_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14867{
14868 RTUINT256U uSrc1 = *puSrc1; /* Could overlap with puDst */
14869 RTUINT256U uSrc2 = *puSrc2; /* Could overlap with puDst */
14870 ASMCompilerBarrier();
14871 puDst->au32[0] = uSrc1.au32[0];
14872 puDst->au32[1] = uSrc2.au32[0];
14873 puDst->au32[2] = uSrc1.au32[1];
14874 puDst->au32[3] = uSrc2.au32[1];
14875
14876 puDst->au32[4] = uSrc1.au32[4];
14877 puDst->au32[5] = uSrc2.au32[4];
14878 puDst->au32[6] = uSrc1.au32[5];
14879 puDst->au32[7] = uSrc2.au32[5];
14880}
14881
14882
14883/*
14884 * UNPCKLPD / VUNPCKLPD
14885 */
14886#ifdef IEM_WITHOUT_ASSEMBLY
14887IEM_DECL_IMPL_DEF(void, iemAImpl_unpcklpd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
14888{
14889 RTUINT128U uSrc1 = *puDst;
14890 RTUINT128U uSrc2 = *puSrc;
14891 ASMCompilerBarrier();
14892 puDst->au64[0] = uSrc1.au64[0];
14893 puDst->au64[1] = uSrc2.au64[0];
14894}
14895
14896#endif
14897
14898IEM_DECL_IMPL_DEF(void, iemAImpl_vunpcklpd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14899{
14900 RTUINT128U uSrc1 = *puSrc1; /* Could overlap with puDst */
14901 RTUINT128U uSrc2 = *puSrc2; /* Could overlap with puDst */
14902 ASMCompilerBarrier();
14903 puDst->au64[0] = uSrc1.au64[0];
14904 puDst->au64[1] = uSrc2.au64[0];
14905}
14906
14907
14908IEM_DECL_IMPL_DEF(void, iemAImpl_vunpcklpd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14909{
14910 RTUINT256U uSrc1 = *puSrc1; /* Could overlap with puDst */
14911 RTUINT256U uSrc2 = *puSrc2; /* Could overlap with puDst */
14912 ASMCompilerBarrier();
14913 puDst->au64[0] = uSrc1.au64[0];
14914 puDst->au64[1] = uSrc2.au64[0];
14915 puDst->au64[2] = uSrc1.au64[2];
14916 puDst->au64[3] = uSrc2.au64[2];
14917}
14918
14919
14920/*
14921 * UNPCKHPS / VUNPCKHPS
14922 */
14923#ifdef IEM_WITHOUT_ASSEMBLY
14924IEM_DECL_IMPL_DEF(void, iemAImpl_unpckhps_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
14925{
14926 RTUINT128U uSrc1 = *puDst;
14927 RTUINT128U uSrc2 = *puSrc;
14928 ASMCompilerBarrier();
14929 puDst->au32[0] = uSrc1.au32[2];
14930 puDst->au32[1] = uSrc2.au32[2];
14931 puDst->au32[2] = uSrc1.au32[3];
14932 puDst->au32[3] = uSrc2.au32[3];
14933}
14934
14935#endif
14936
14937IEM_DECL_IMPL_DEF(void, iemAImpl_vunpckhps_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14938{
14939 RTUINT128U uSrc1 = *puSrc1; /* Could overlap with puDst */
14940 RTUINT128U uSrc2 = *puSrc2; /* Could overlap with puDst */
14941 ASMCompilerBarrier();
14942 puDst->au32[0] = uSrc1.au32[2];
14943 puDst->au32[1] = uSrc2.au32[2];
14944 puDst->au32[2] = uSrc1.au32[3];
14945 puDst->au32[3] = uSrc2.au32[3];
14946}
14947
14948
14949IEM_DECL_IMPL_DEF(void, iemAImpl_vunpckhps_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14950{
14951 RTUINT256U uSrc1 = *puSrc1; /* Could overlap with puDst */
14952 RTUINT256U uSrc2 = *puSrc2; /* Could overlap with puDst */
14953 ASMCompilerBarrier();
14954 puDst->au32[0] = uSrc1.au32[2];
14955 puDst->au32[1] = uSrc2.au32[2];
14956 puDst->au32[2] = uSrc1.au32[3];
14957 puDst->au32[3] = uSrc2.au32[3];
14958
14959 puDst->au32[4] = uSrc1.au32[6];
14960 puDst->au32[5] = uSrc2.au32[6];
14961 puDst->au32[6] = uSrc1.au32[7];
14962 puDst->au32[7] = uSrc2.au32[7];
14963}
14964
14965
14966/*
14967 * UNPCKHPD / VUNPCKHPD
14968 */
14969#ifdef IEM_WITHOUT_ASSEMBLY
14970IEM_DECL_IMPL_DEF(void, iemAImpl_unpckhpd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
14971{
14972 RTUINT128U uSrc1 = *puDst;
14973 RTUINT128U uSrc2 = *puSrc;
14974 ASMCompilerBarrier();
14975 puDst->au64[0] = uSrc1.au64[1];
14976 puDst->au64[1] = uSrc2.au64[1];
14977}
14978
14979#endif
14980
14981IEM_DECL_IMPL_DEF(void, iemAImpl_vunpckhpd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14982{
14983 RTUINT128U uSrc1 = *puSrc1; /* Could overlap with puDst */
14984 RTUINT128U uSrc2 = *puSrc2; /* Could overlap with puDst */
14985 ASMCompilerBarrier();
14986 puDst->au64[0] = uSrc1.au64[1];
14987 puDst->au64[1] = uSrc2.au64[1];
14988}
14989
14990
14991IEM_DECL_IMPL_DEF(void, iemAImpl_vunpckhpd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14992{
14993 RTUINT256U uSrc1 = *puSrc1; /* Could overlap with puDst */
14994 RTUINT256U uSrc2 = *puSrc2; /* Could overlap with puDst */
14995 ASMCompilerBarrier();
14996 puDst->au64[0] = uSrc1.au64[1];
14997 puDst->au64[1] = uSrc2.au64[1];
14998 puDst->au64[2] = uSrc1.au64[3];
14999 puDst->au64[3] = uSrc2.au64[3];
15000}
15001
15002
/*
 * CRC32 (SSE 4.2).
 */
15006
15007IEM_DECL_IMPL_DEF(void, iemAImpl_crc32_u8_fallback,(uint32_t *puDst, uint8_t uSrc))
15008{
15009 *puDst = RTCrc32CProcess(*puDst, &uSrc, sizeof(uSrc));
15010}
15011
15012
15013IEM_DECL_IMPL_DEF(void, iemAImpl_crc32_u16_fallback,(uint32_t *puDst, uint16_t uSrc))
15014{
15015 *puDst = RTCrc32CProcess(*puDst, &uSrc, sizeof(uSrc));
15016}
15017
15018IEM_DECL_IMPL_DEF(void, iemAImpl_crc32_u32_fallback,(uint32_t *puDst, uint32_t uSrc))
15019{
15020 *puDst = RTCrc32CProcess(*puDst, &uSrc, sizeof(uSrc));
15021}
15022
15023IEM_DECL_IMPL_DEF(void, iemAImpl_crc32_u64_fallback,(uint32_t *puDst, uint64_t uSrc))
15024{
15025 *puDst = RTCrc32CProcess(*puDst, &uSrc, sizeof(uSrc));
15026}
15027
15028
/*
 * PTEST (SSE 4.1) - special as it outputs only EFLAGS.
 */
15032#ifdef IEM_WITHOUT_ASSEMBLY
15033IEM_DECL_IMPL_DEF(void, iemAImpl_ptest_u128,(PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint32_t *pfEFlags))
15034{
15035 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS;
15036 if ( (puSrc1->au64[0] & puSrc2->au64[0]) == 0
15037 && (puSrc1->au64[1] & puSrc2->au64[1]) == 0)
15038 fEfl |= X86_EFL_ZF;
15039 if ( (~puSrc1->au64[0] & puSrc2->au64[0]) == 0
15040 && (~puSrc1->au64[1] & puSrc2->au64[1]) == 0)
15041 fEfl |= X86_EFL_CF;
15042 *pfEFlags = fEfl;
15043}
15044#endif
15045
15046IEM_DECL_IMPL_DEF(void, iemAImpl_vptest_u256_fallback,(PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint32_t *pfEFlags))
15047{
15048 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS;
15049 if ( (puSrc1->au64[0] & puSrc2->au64[0]) == 0
15050 && (puSrc1->au64[1] & puSrc2->au64[1]) == 0
15051 && (puSrc1->au64[2] & puSrc2->au64[2]) == 0
15052 && (puSrc1->au64[3] & puSrc2->au64[3]) == 0)
15053 fEfl |= X86_EFL_ZF;
15054 if ( (~puSrc1->au64[0] & puSrc2->au64[0]) == 0
15055 && (~puSrc1->au64[1] & puSrc2->au64[1]) == 0
15056 && (~puSrc1->au64[2] & puSrc2->au64[2]) == 0
15057 && (~puSrc1->au64[3] & puSrc2->au64[3]) == 0)
15058 fEfl |= X86_EFL_CF;
15059 *pfEFlags = fEfl;
15060}
15061
15062
15063/*
15064 * PMOVSXBW / VPMOVSXBW
15065 */
15066IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxbw_u128_fallback,(PRTUINT128U puDst, uint64_t uSrc))
15067{
15068 RTUINT64U uSrc1 = { uSrc };
15069 puDst->ai16[0] = uSrc1.ai8[0];
15070 puDst->ai16[1] = uSrc1.ai8[1];
15071 puDst->ai16[2] = uSrc1.ai8[2];
15072 puDst->ai16[3] = uSrc1.ai8[3];
15073 puDst->ai16[4] = uSrc1.ai8[4];
15074 puDst->ai16[5] = uSrc1.ai8[5];
15075 puDst->ai16[6] = uSrc1.ai8[6];
15076 puDst->ai16[7] = uSrc1.ai8[7];
15077}
15078
15079
15080IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxbw_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
15081{
15082 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
15083 puDst->ai16[ 0] = uSrc1.ai8[ 0];
15084 puDst->ai16[ 1] = uSrc1.ai8[ 1];
15085 puDst->ai16[ 2] = uSrc1.ai8[ 2];
15086 puDst->ai16[ 3] = uSrc1.ai8[ 3];
15087 puDst->ai16[ 4] = uSrc1.ai8[ 4];
15088 puDst->ai16[ 5] = uSrc1.ai8[ 5];
15089 puDst->ai16[ 6] = uSrc1.ai8[ 6];
15090 puDst->ai16[ 7] = uSrc1.ai8[ 7];
15091 puDst->ai16[ 8] = uSrc1.ai8[ 8];
15092 puDst->ai16[ 9] = uSrc1.ai8[ 9];
15093 puDst->ai16[10] = uSrc1.ai8[10];
15094 puDst->ai16[11] = uSrc1.ai8[11];
15095 puDst->ai16[12] = uSrc1.ai8[12];
15096 puDst->ai16[13] = uSrc1.ai8[13];
15097 puDst->ai16[14] = uSrc1.ai8[14];
15098 puDst->ai16[15] = uSrc1.ai8[15];
15099}
15100
15101
15102/*
15103 * PMOVSXBD / VPMOVSXBD
15104 */
15105IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxbd_u128_fallback,(PRTUINT128U puDst, uint32_t uSrc))
15106{
15107 RTUINT32U uSrc1 = { uSrc };
15108 puDst->ai32[0] = uSrc1.ai8[0];
15109 puDst->ai32[1] = uSrc1.ai8[1];
15110 puDst->ai32[2] = uSrc1.ai8[2];
15111 puDst->ai32[3] = uSrc1.ai8[3];
15112}
15113
15114
15115IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxbd_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
15116{
15117 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
15118 puDst->ai32[0] = uSrc1.ai8[0];
15119 puDst->ai32[1] = uSrc1.ai8[1];
15120 puDst->ai32[2] = uSrc1.ai8[2];
15121 puDst->ai32[3] = uSrc1.ai8[3];
15122 puDst->ai32[4] = uSrc1.ai8[4];
15123 puDst->ai32[5] = uSrc1.ai8[5];
15124 puDst->ai32[6] = uSrc1.ai8[6];
15125 puDst->ai32[7] = uSrc1.ai8[7];
15126}
15127
15128
15129/*
15130 * PMOVSXBQ / VPMOVSXBQ
15131 */
15132IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxbq_u128_fallback,(PRTUINT128U puDst, uint16_t uSrc))
15133{
15134 RTUINT16U uSrc1 = { uSrc };
15135 puDst->ai64[0] = uSrc1.ai8[0];
15136 puDst->ai64[1] = uSrc1.ai8[1];
15137}
15138
15139
15140IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxbq_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
15141{
15142 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
15143 puDst->ai64[0] = uSrc1.ai8[0];
15144 puDst->ai64[1] = uSrc1.ai8[1];
15145 puDst->ai64[2] = uSrc1.ai8[2];
15146 puDst->ai64[3] = uSrc1.ai8[3];
15147}
15148
15149
15150/*
15151 * PMOVSXWD / VPMOVSXWD
15152 */
15153IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxwd_u128_fallback,(PRTUINT128U puDst, uint64_t uSrc))
15154{
15155 RTUINT64U uSrc1 = { uSrc };
15156 puDst->ai32[0] = uSrc1.ai16[0];
15157 puDst->ai32[1] = uSrc1.ai16[1];
15158 puDst->ai32[2] = uSrc1.ai16[2];
15159 puDst->ai32[3] = uSrc1.ai16[3];
15160}
15161
15162
15163IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxwd_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
15164{
15165 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
15166 puDst->ai32[0] = uSrc1.ai16[0];
15167 puDst->ai32[1] = uSrc1.ai16[1];
15168 puDst->ai32[2] = uSrc1.ai16[2];
15169 puDst->ai32[3] = uSrc1.ai16[3];
15170 puDst->ai32[4] = uSrc1.ai16[4];
15171 puDst->ai32[5] = uSrc1.ai16[5];
15172 puDst->ai32[6] = uSrc1.ai16[6];
15173 puDst->ai32[7] = uSrc1.ai16[7];
15174}
15175
15176
15177/*
15178 * PMOVSXWQ / VPMOVSXWQ
15179 */
15180IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxwq_u128_fallback,(PRTUINT128U puDst, uint32_t uSrc))
15181{
15182 RTUINT32U uSrc1 = { uSrc };
15183 puDst->ai64[0] = uSrc1.ai16[0];
15184 puDst->ai64[1] = uSrc1.ai16[1];
15185}
15186
15187
15188IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxwq_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
15189{
15190 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
15191 puDst->ai64[0] = uSrc1.ai16[0];
15192 puDst->ai64[1] = uSrc1.ai16[1];
15193 puDst->ai64[2] = uSrc1.ai16[2];
15194 puDst->ai64[3] = uSrc1.ai16[3];
15195}
15196
15197
15198/*
15199 * PMOVSXDQ / VPMOVSXDQ
15200 */
15201IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxdq_u128_fallback,(PRTUINT128U puDst, uint64_t uSrc))
15202{
15203 RTUINT64U uSrc1 = { uSrc };
15204 puDst->ai64[0] = uSrc1.ai32[0];
15205 puDst->ai64[1] = uSrc1.ai32[1];
15206}
15207
15208
15209IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxdq_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
15210{
15211 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
15212 puDst->ai64[0] = uSrc1.ai32[0];
15213 puDst->ai64[1] = uSrc1.ai32[1];
15214 puDst->ai64[2] = uSrc1.ai32[2];
15215 puDst->ai64[3] = uSrc1.ai32[3];
15216}
15217
15218
15219/*
15220 * PMOVZXBW / VPMOVZXBW
15221 */
15222IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxbw_u128_fallback,(PRTUINT128U puDst, uint64_t uSrc))
15223{
15224 RTUINT64U uSrc1 = { uSrc };
15225 puDst->au16[0] = uSrc1.au8[0];
15226 puDst->au16[1] = uSrc1.au8[1];
15227 puDst->au16[2] = uSrc1.au8[2];
15228 puDst->au16[3] = uSrc1.au8[3];
15229 puDst->au16[4] = uSrc1.au8[4];
15230 puDst->au16[5] = uSrc1.au8[5];
15231 puDst->au16[6] = uSrc1.au8[6];
15232 puDst->au16[7] = uSrc1.au8[7];
15233}
15234
15235
15236IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxbw_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
15237{
15238 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
15239 puDst->au16[ 0] = uSrc1.au8[ 0];
15240 puDst->au16[ 1] = uSrc1.au8[ 1];
15241 puDst->au16[ 2] = uSrc1.au8[ 2];
15242 puDst->au16[ 3] = uSrc1.au8[ 3];
15243 puDst->au16[ 4] = uSrc1.au8[ 4];
15244 puDst->au16[ 5] = uSrc1.au8[ 5];
15245 puDst->au16[ 6] = uSrc1.au8[ 6];
15246 puDst->au16[ 7] = uSrc1.au8[ 7];
15247 puDst->au16[ 8] = uSrc1.au8[ 8];
15248 puDst->au16[ 9] = uSrc1.au8[ 9];
15249 puDst->au16[10] = uSrc1.au8[10];
15250 puDst->au16[11] = uSrc1.au8[11];
15251 puDst->au16[12] = uSrc1.au8[12];
15252 puDst->au16[13] = uSrc1.au8[13];
15253 puDst->au16[14] = uSrc1.au8[14];
15254 puDst->au16[15] = uSrc1.au8[15];
15255}
15256
15257
15258/*
15259 * PMOVZXBD / VPMOVZXBD
15260 */
15261IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxbd_u128_fallback,(PRTUINT128U puDst, uint32_t uSrc))
15262{
15263 RTUINT32U uSrc1 = { uSrc };
15264 puDst->au32[0] = uSrc1.au8[0];
15265 puDst->au32[1] = uSrc1.au8[1];
15266 puDst->au32[2] = uSrc1.au8[2];
15267 puDst->au32[3] = uSrc1.au8[3];
15268}
15269
15270
15271IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxbd_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
15272{
15273 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
15274 puDst->au32[0] = uSrc1.au8[0];
15275 puDst->au32[1] = uSrc1.au8[1];
15276 puDst->au32[2] = uSrc1.au8[2];
15277 puDst->au32[3] = uSrc1.au8[3];
15278 puDst->au32[4] = uSrc1.au8[4];
15279 puDst->au32[5] = uSrc1.au8[5];
15280 puDst->au32[6] = uSrc1.au8[6];
15281 puDst->au32[7] = uSrc1.au8[7];
15282}
15283
15284
15285/*
15286 * PMOVZXBQ / VPMOVZXBQ
15287 */
15288IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxbq_u128_fallback,(PRTUINT128U puDst, uint16_t uSrc))
15289{
15290 RTUINT16U uSrc1 = { uSrc };
15291 puDst->au64[0] = uSrc1.au8[0];
15292 puDst->au64[1] = uSrc1.au8[1];
15293}
15294
15295
15296IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxbq_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
15297{
15298 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
15299 puDst->au64[0] = uSrc1.au8[0];
15300 puDst->au64[1] = uSrc1.au8[1];
15301 puDst->au64[2] = uSrc1.au8[2];
15302 puDst->au64[3] = uSrc1.au8[3];
15303}
15304
15305
15306/*
15307 * PMOVZXWD / VPMOVZXWD
15308 */
15309IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxwd_u128_fallback,(PRTUINT128U puDst, uint64_t uSrc))
15310{
15311 RTUINT64U uSrc1 = { uSrc };
15312 puDst->au32[0] = uSrc1.au16[0];
15313 puDst->au32[1] = uSrc1.au16[1];
15314 puDst->au32[2] = uSrc1.au16[2];
15315 puDst->au32[3] = uSrc1.au16[3];
15316}
15317
15318
15319IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxwd_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
15320{
15321 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
15322 puDst->au32[0] = uSrc1.au16[0];
15323 puDst->au32[1] = uSrc1.au16[1];
15324 puDst->au32[2] = uSrc1.au16[2];
15325 puDst->au32[3] = uSrc1.au16[3];
15326 puDst->au32[4] = uSrc1.au16[4];
15327 puDst->au32[5] = uSrc1.au16[5];
15328 puDst->au32[6] = uSrc1.au16[6];
15329 puDst->au32[7] = uSrc1.au16[7];
15330}
15331
15332
15333/*
15334 * PMOVZXWQ / VPMOVZXWQ
15335 */
15336IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxwq_u128_fallback,(PRTUINT128U puDst, uint32_t uSrc))
15337{
15338 RTUINT32U uSrc1 = { uSrc };
15339 puDst->au64[0] = uSrc1.au16[0];
15340 puDst->au64[1] = uSrc1.au16[1];
15341}
15342
15343
15344IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxwq_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
15345{
15346 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
15347 puDst->au64[0] = uSrc1.au16[0];
15348 puDst->au64[1] = uSrc1.au16[1];
15349 puDst->au64[2] = uSrc1.au16[2];
15350 puDst->au64[3] = uSrc1.au16[3];
15351}
15352
15353
15354/*
15355 * PMOVZXDQ / VPMOVZXDQ
15356 */
15357IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxdq_u128_fallback,(PRTUINT128U puDst, uint64_t uSrc))
15358{
15359 RTUINT64U uSrc1 = { uSrc };
15360 puDst->au64[0] = uSrc1.au32[0];
15361 puDst->au64[1] = uSrc1.au32[1];
15362}
15363
15364
15365IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxdq_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
15366{
15367 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
15368 puDst->au64[0] = uSrc1.au32[0];
15369 puDst->au64[1] = uSrc1.au32[1];
15370 puDst->au64[2] = uSrc1.au32[2];
15371 puDst->au64[3] = uSrc1.au32[3];
15372}
15373
15374/**
15375 * Converts from the packed IPRT 32-bit (single precision) floating point format to
15376 * the SoftFloat 32-bit floating point format (float32_t).
15377 *
15378 * This is only a structure format conversion, nothing else.
15379 */
15380DECLINLINE(float32_t) iemFpSoftF32FromIprt(PCRTFLOAT32U pr32Val)
15381{
15382 float32_t Tmp;
15383 Tmp.v = pr32Val->u;
15384 return Tmp;
15385}
15386
15387
15388/**
15389 * Converts from SoftFloat 32-bit floating point format (float32_t)
15390 * to the packed IPRT 32-bit floating point (RTFLOAT32U) format.
15391 *
15392 * This is only a structure format conversion, nothing else.
15393 */
15394DECLINLINE(PRTFLOAT32U) iemFpSoftF32ToIprt(PRTFLOAT32U pr32Dst, float32_t const r32XSrc)
15395{
15396 pr32Dst->u = r32XSrc.v;
15397 return pr32Dst;
15398}
15399
15400
15401/**
15402 * Converts from the packed IPRT 64-bit (single precision) floating point format to
15403 * the SoftFloat 64-bit floating point format (float64_t).
15404 *
15405 * This is only a structure format conversion, nothing else.
15406 */
15407DECLINLINE(float64_t) iemFpSoftF64FromIprt(PCRTFLOAT64U pr64Val)
15408{
15409 float64_t Tmp;
15410 Tmp.v = pr64Val->u;
15411 return Tmp;
15412}
15413
15414
15415/**
15416 * Converts from SoftFloat 64-bit floating point format (float64_t)
15417 * to the packed IPRT 64-bit floating point (RTFLOAT64U) format.
15418 *
15419 * This is only a structure format conversion, nothing else.
15420 */
15421DECLINLINE(PRTFLOAT64U) iemFpSoftF64ToIprt(PRTFLOAT64U pr64Dst, float64_t const r64XSrc)
15422{
15423 pr64Dst->u = r64XSrc.v;
15424 return pr64Dst;
15425}
15426
15427
/**
 * Initializer for the SoftFloat state structure.
 *
 * Expands to a brace-enclosed list of five positional values, in this order:
 *      -# softfloat_tininess_afterRounding,
 *      -# the SoftFloat rounding mode selected from the MXCSR.RC bits
 *         (NEAREST -> near_even, UP -> max, DOWN -> min, otherwise minMag),
 *      -# zero,
 *      -# the MXCSR exception mask bits shifted down by X86_MXCSR_XCPT_MASK_SHIFT,
 *      -# 32 as the rounding precision.
 *
 * NOTE(review): the structure member each position initializes is not visible
 * here; presumably the third value is the initial exception flags - confirm
 * against the softfloat_state_t definition.
 *
 * @param   a_Mxcsr     The MXCSR value to derive the state from.  Evaluated
 *                      more than once.
 */
# define IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(a_Mxcsr) \
    { \
        softfloat_tininess_afterRounding, \
        ((a_Mxcsr) & X86_MXCSR_RC_MASK) == X86_MXCSR_RC_NEAREST ? (uint8_t)softfloat_round_near_even \
        : ((a_Mxcsr) & X86_MXCSR_RC_MASK) == X86_MXCSR_RC_UP ? (uint8_t)softfloat_round_max \
        : ((a_Mxcsr) & X86_MXCSR_RC_MASK) == X86_MXCSR_RC_DOWN ? (uint8_t)softfloat_round_min \
        : (uint8_t)softfloat_round_minMag, \
        0, \
        (uint8_t)(((a_Mxcsr) & X86_MXCSR_XCPT_MASK) >> X86_MXCSR_XCPT_MASK_SHIFT), /* Matches X86_FSW_?E */\
        32 /* Rounding precision, not relevant for SIMD. */ \
    }
15440
15441#ifdef IEM_WITHOUT_ASSEMBLY
15442
15443/**
15444 * Helper for transfering exception to MXCSR and setting the result value
15445 * accordingly.
15446 *
15447 * @returns Updated MXCSR.
15448 * @param pSoftState The SoftFloat state following the operation.
15449 * @param r32Result The result of the SoftFloat operation.
15450 * @param pr32Result Where to store the result for IEM.
15451 * @param fMxcsr The original MXCSR value.
15452 */
15453DECLINLINE(uint32_t) iemSseSoftStateAndR32ToMxcsrAndIprtResult(softfloat_state_t const *pSoftState, float32_t r32Result,
15454 PRTFLOAT32U pr32Result, uint32_t fMxcsr)
15455{
15456 iemFpSoftF32ToIprt(pr32Result, r32Result);
15457
15458 uint8_t fXcpt = pSoftState->exceptionFlags;
15459 if ( (fMxcsr & X86_MXCSR_FZ)
15460 && RTFLOAT32U_IS_SUBNORMAL(pr32Result))
15461 {
15462 /* Underflow masked and flush to zero is set. */
15463 pr32Result->s.uFraction = 0;
15464 pr32Result->s.uExponent = 0;
15465 fXcpt |= X86_MXCSR_UE | X86_MXCSR_PE;
15466 }
15467
15468 /* If DAZ is set \#DE is never set. */
15469 if ( fMxcsr & X86_MXCSR_DAZ
15470 || ( (fXcpt & X86_MXCSR_DE) /* Softfloat sets DE for sub-normal values. */
15471 && (RTFLOAT32U_IS_SUBNORMAL(pr32Result))))
15472 fXcpt &= ~X86_MXCSR_DE;
15473
15474 return fMxcsr | (fXcpt & X86_MXCSR_XCPT_FLAGS);
15475}
15476
15477
15478/**
15479 * Helper for transfering exception to MXCSR and setting the result value
15480 * accordingly - ignores Flush-to-Zero.
15481 *
15482 * @returns Updated MXCSR.
15483 * @param pSoftState The SoftFloat state following the operation.
15484 * @param r32Result The result of the SoftFloat operation.
15485 * @param pr32Result Where to store the result for IEM.
15486 * @param fMxcsr The original MXCSR value.
15487 */
15488DECLINLINE(uint32_t) iemSseSoftStateAndR32ToMxcsrAndIprtResultNoFz(softfloat_state_t const *pSoftState, float32_t r32Result,
15489 PRTFLOAT32U pr32Result, uint32_t fMxcsr)
15490{
15491 iemFpSoftF32ToIprt(pr32Result, r32Result);
15492
15493 uint8_t fXcpt = pSoftState->exceptionFlags;
15494 /* If DAZ is set \#DE is never set. */
15495 if ( fMxcsr & X86_MXCSR_DAZ
15496 || ( (fXcpt & X86_MXCSR_DE) /* Softfloat sets DE for sub-normal values. */
15497 && (RTFLOAT32U_IS_SUBNORMAL(pr32Result))))
15498 fXcpt &= ~X86_MXCSR_DE;
15499
15500 return fMxcsr | (fXcpt & X86_MXCSR_XCPT_FLAGS);
15501}
15502
15503
15504/**
15505 * Helper for transfering exception to MXCSR and setting the result value
15506 * accordingly.
15507 *
15508 * @returns Updated MXCSR.
15509 * @param pSoftState The SoftFloat state following the operation.
15510 * @param r64Result The result of the SoftFloat operation.
15511 * @param pr64Result Where to store the result for IEM.
15512 * @param fMxcsr The original MXCSR value.
15513 */
15514DECLINLINE(uint32_t) iemSseSoftStateAndR64ToMxcsrAndIprtResult(softfloat_state_t const *pSoftState, float64_t r64Result,
15515 PRTFLOAT64U pr64Result, uint32_t fMxcsr)
15516{
15517 iemFpSoftF64ToIprt(pr64Result, r64Result);
15518 uint8_t fXcpt = pSoftState->exceptionFlags;
15519 if ( (fMxcsr & X86_MXCSR_FZ)
15520 && RTFLOAT64U_IS_SUBNORMAL(pr64Result))
15521 {
15522 /* Underflow masked and flush to zero is set. */
15523 iemFpSoftF64ToIprt(pr64Result, r64Result);
15524 pr64Result->s.uFractionHigh = 0;
15525 pr64Result->s.uFractionLow = 0;
15526 pr64Result->s.uExponent = 0;
15527 fXcpt |= X86_MXCSR_UE | X86_MXCSR_PE;
15528 }
15529
15530 /* If DAZ is set \#DE is never set. */
15531 if ( fMxcsr & X86_MXCSR_DAZ
15532 || ( (fXcpt & X86_MXCSR_DE) /* Softfloat sets DE for sub-normal values. */
15533 && (RTFLOAT64U_IS_SUBNORMAL(pr64Result))))
15534 fXcpt &= ~X86_MXCSR_DE;
15535
15536 return fMxcsr | (fXcpt & X86_MXCSR_XCPT_FLAGS);
15537}
15538
15539
15540/**
15541 * Helper for transfering exception to MXCSR and setting the result value
15542 * accordingly - ignores Flush-to-Zero.
15543 *
15544 * @returns Updated MXCSR.
15545 * @param pSoftState The SoftFloat state following the operation.
15546 * @param r64Result The result of the SoftFloat operation.
15547 * @param pr64Result Where to store the result for IEM.
15548 * @param fMxcsr The original MXCSR value.
15549 */
15550DECLINLINE(uint32_t) iemSseSoftStateAndR64ToMxcsrAndIprtResultNoFz(softfloat_state_t const *pSoftState, float64_t r64Result,
15551 PRTFLOAT64U pr64Result, uint32_t fMxcsr)
15552{
15553 iemFpSoftF64ToIprt(pr64Result, r64Result);
15554
15555 uint8_t fXcpt = pSoftState->exceptionFlags;
15556 /* If DAZ is set \#DE is never set. */
15557 if ( fMxcsr & X86_MXCSR_DAZ
15558 || ( (fXcpt & X86_MXCSR_DE) /* Softfloat sets DE for sub-normal values. */
15559 && (RTFLOAT64U_IS_SUBNORMAL(pr64Result))))
15560 fXcpt &= ~X86_MXCSR_DE;
15561
15562 return fMxcsr | (fXcpt & X86_MXCSR_XCPT_FLAGS);
15563}
15564
15565#endif /* IEM_WITHOUT_ASSEMBLY */
15566
15567
15568/**
15569 * Sets the given single precision floating point input value to the given output taking the Denormals-as-zero flag
15570 * in MXCSR into account.
15571 *
15572 * @returns The output MXCSR De-normal flag if the input is a de-normal and the DAZ flag is not set.
15573 * @param pr32Val Where to store the result.
15574 * @param fMxcsr The input MXCSR value.
15575 * @param pr32Src The value to use.
15576 */
15577DECLINLINE(uint32_t) iemSsePrepareValueR32(PRTFLOAT32U pr32Val, uint32_t fMxcsr, PCRTFLOAT32U pr32Src)
15578{
15579 if (RTFLOAT32U_IS_SUBNORMAL(pr32Src))
15580 {
15581 if (fMxcsr & X86_MXCSR_DAZ)
15582 {
15583 /* De-normals are changed to 0. */
15584 pr32Val->s.fSign = pr32Src->s.fSign;
15585 pr32Val->s.uFraction = 0;
15586 pr32Val->s.uExponent = 0;
15587 return 0;
15588 }
15589
15590 *pr32Val = *pr32Src;
15591 return X86_MXCSR_DE;
15592 }
15593
15594 *pr32Val = *pr32Src;
15595 return 0;
15596}
15597
15598
15599/**
15600 * Sets the given double precision floating point input value to the given output taking the Denormals-as-zero flag
15601 * in MXCSR into account.
15602 *
15603 * @returns The output MXCSR De-normal flag if the input is a de-normal and the DAZ flag is not set.
15604 * @param pr64Val Where to store the result.
15605 * @param fMxcsr The input MXCSR value.
15606 * @param pr64Src The value to use.
15607 */
15608DECLINLINE(uint32_t) iemSsePrepareValueR64(PRTFLOAT64U pr64Val, uint32_t fMxcsr, PCRTFLOAT64U pr64Src)
15609{
15610 if (RTFLOAT64U_IS_SUBNORMAL(pr64Src))
15611 {
15612 if (fMxcsr & X86_MXCSR_DAZ)
15613 {
15614 /* De-normals are changed to 0. */
15615 pr64Val->s64.fSign = pr64Src->s.fSign;
15616 pr64Val->s64.uFraction = 0;
15617 pr64Val->s64.uExponent = 0;
15618 return 0;
15619 }
15620
15621 *pr64Val = *pr64Src;
15622 return X86_MXCSR_DE;
15623 }
15624
15625 *pr64Val = *pr64Src;
15626 return 0;
15627}
15628
15629#ifdef IEM_WITHOUT_ASSEMBLY
15630
15631/**
15632 * Validates the given input operands returning whether the operation can continue or whether one
15633 * of the source operands contains a NaN value, setting the output accordingly.
15634 *
15635 * @returns Flag whether the operation can continue (false) or whether a NaN value was detected in one of the operands (true).
15636 * @param pr32Res Where to store the result in case the operation can't continue.
15637 * @param pr32Val1 The first input operand.
15638 * @param pr32Val2 The second input operand.
15639 * @param pfMxcsr Where to return the modified MXCSR state when false is returned.
15640 */
15641DECLINLINE(bool) iemSseBinaryValIsNaNR32(PRTFLOAT32U pr32Res, PCRTFLOAT32U pr32Val1, PCRTFLOAT32U pr32Val2, uint32_t *pfMxcsr)
15642{
15643 uint8_t const cQNan = RTFLOAT32U_IS_QUIET_NAN(pr32Val1) + RTFLOAT32U_IS_QUIET_NAN(pr32Val2);
15644 uint8_t const cSNan = RTFLOAT32U_IS_SIGNALLING_NAN(pr32Val1) + RTFLOAT32U_IS_SIGNALLING_NAN(pr32Val2);
15645 if (cSNan + cQNan == 2)
15646 {
15647 /* Both values are either SNan or QNan, first operand is placed into the result and converted to a QNan. */
15648 *pr32Res = *pr32Val1;
15649 pr32Res->s.uFraction |= RT_BIT_32(RTFLOAT32U_FRACTION_BITS - 1);
15650 *pfMxcsr |= (cSNan ? X86_MXCSR_IE : 0);
15651 return true;
15652 }
15653 if (cSNan)
15654 {
15655 /* One operand is an SNan and placed into the result, converting it to a QNan. */
15656 *pr32Res = RTFLOAT32U_IS_SIGNALLING_NAN(pr32Val1) ? *pr32Val1 : *pr32Val2;
15657 pr32Res->s.uFraction |= RT_BIT_32(RTFLOAT32U_FRACTION_BITS - 1);
15658 *pfMxcsr |= X86_MXCSR_IE;
15659 return true;
15660 }
15661 if (cQNan)
15662 {
15663 /* The QNan operand is placed into the result. */
15664 *pr32Res = RTFLOAT32U_IS_QUIET_NAN(pr32Val1) ? *pr32Val1 : *pr32Val2;
15665 return true;
15666 }
15667
15668 Assert(!cQNan && !cSNan);
15669 return false;
15670}
15671
15672
15673/**
15674 * Validates the given double precision input operands returning whether the operation can continue or whether one
15675 * of the source operands contains a NaN value, setting the output accordingly.
15676 *
15677 * @returns Flag whether the operation can continue (false) or whether a NaN value was detected in one of the operands (true).
15678 * @param pr64Res Where to store the result in case the operation can't continue.
15679 * @param pr64Val1 The first input operand.
15680 * @param pr64Val2 The second input operand.
15681 * @param pfMxcsr Where to return the modified MXCSR state when false is returned.
15682 */
15683DECLINLINE(bool) iemSseBinaryValIsNaNR64(PRTFLOAT64U pr64Res, PCRTFLOAT64U pr64Val1, PCRTFLOAT64U pr64Val2, uint32_t *pfMxcsr)
15684{
15685 uint8_t const cQNan = RTFLOAT64U_IS_QUIET_NAN(pr64Val1) + RTFLOAT64U_IS_QUIET_NAN(pr64Val2);
15686 uint8_t const cSNan = RTFLOAT64U_IS_SIGNALLING_NAN(pr64Val1) + RTFLOAT64U_IS_SIGNALLING_NAN(pr64Val2);
15687 if (cSNan + cQNan == 2)
15688 {
15689 /* Both values are either SNan or QNan, first operand is placed into the result and converted to a QNan. */
15690 *pr64Res = *pr64Val1;
15691 pr64Res->s64.uFraction |= RT_BIT_64(RTFLOAT64U_FRACTION_BITS - 1);
15692 *pfMxcsr |= (cSNan ? X86_MXCSR_IE : 0);
15693 return true;
15694 }
15695 if (cSNan)
15696 {
15697 /* One operand is an SNan and placed into the result, converting it to a QNan. */
15698 *pr64Res = RTFLOAT64U_IS_SIGNALLING_NAN(pr64Val1) ? *pr64Val1 : *pr64Val2;
15699 pr64Res->s64.uFraction |= RT_BIT_64(RTFLOAT64U_FRACTION_BITS - 1);
15700 *pfMxcsr |= X86_MXCSR_IE;
15701 return true;
15702 }
15703 if (cQNan)
15704 {
15705 /* The QNan operand is placed into the result. */
15706 *pr64Res = RTFLOAT64U_IS_QUIET_NAN(pr64Val1) ? *pr64Val1 : *pr64Val2;
15707 return true;
15708 }
15709
15710 Assert(!cQNan && !cSNan);
15711 return false;
15712}
15713
15714
15715/**
15716 * Validates the given single input operand returning whether the operation can continue or whether
15717 * contains a NaN value, setting the output accordingly.
15718 *
15719 * @returns Flag whether the operation can continue (false) or whether a NaN value was detected in the operand (true).
15720 * @param pr32Res Where to store the result in case the operation can't continue.
15721 * @param pr32Val The input operand.
15722 * @param pfMxcsr Where to return the modified MXCSR state when false is returned.
15723 */
15724DECLINLINE(bool) iemSseUnaryValIsNaNR32(PRTFLOAT32U pr32Res, PCRTFLOAT32U pr32Val, uint32_t *pfMxcsr)
15725{
15726 if (RTFLOAT32U_IS_SIGNALLING_NAN(pr32Val))
15727 {
15728 /* One operand is an SNan and placed into the result, converting it to a QNan. */
15729 *pr32Res = *pr32Val;
15730 pr32Res->s.uFraction |= RT_BIT_32(RTFLOAT32U_FRACTION_BITS - 1);
15731 *pfMxcsr |= X86_MXCSR_IE;
15732 return true;
15733 }
15734 if (RTFLOAT32U_IS_QUIET_NAN(pr32Val))
15735 {
15736 /* The QNan operand is placed into the result. */
15737 *pr32Res = *pr32Val;
15738 return true;
15739 }
15740
15741 return false;
15742}
15743
15744
15745/**
15746 * Validates the given double input operand returning whether the operation can continue or whether
15747 * contains a NaN value, setting the output accordingly.
15748 *
15749 * @returns Flag whether the operation can continue (false) or whether a NaN value was detected in the operand (true).
15750 * @param pr64Res Where to store the result in case the operation can't continue.
15751 * @param pr64Val The input operand.
15752 * @param pfMxcsr Where to return the modified MXCSR state when false is returned.
15753 */
15754DECLINLINE(bool) iemSseUnaryValIsNaNR64(PRTFLOAT64U pr64Res, PCRTFLOAT64U pr64Val, uint32_t *pfMxcsr)
15755{
15756 if (RTFLOAT64U_IS_SIGNALLING_NAN(pr64Val))
15757 {
15758 /* One operand is an SNan and placed into the result, converting it to a QNan. */
15759 *pr64Res = *pr64Val;
15760 pr64Res->s64.uFraction |= RT_BIT_64(RTFLOAT64U_FRACTION_BITS - 1);
15761 *pfMxcsr |= X86_MXCSR_IE;
15762 return true;
15763 }
15764 if (RTFLOAT64U_IS_QUIET_NAN(pr64Val))
15765 {
15766 /* The QNan operand is placed into the result. */
15767 *pr64Res = *pr64Val;
15768 return true;
15769 }
15770
15771 return false;
15772}
15773
15774#endif /* IEM_WITHOUT_ASSEMBLY */
15775
15776/**
15777 * ADDPS
15778 */
15779#ifdef IEM_WITHOUT_ASSEMBLY
15780static uint32_t iemAImpl_addps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1, PCRTFLOAT32U pr32Val2)
15781{
15782 if (iemSseBinaryValIsNaNR32(pr32Res, pr32Val1, pr32Val2, &fMxcsr))
15783 return fMxcsr;
15784
15785 RTFLOAT32U r32Src1, r32Src2;
15786 fMxcsr |= iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
15787 fMxcsr |= iemSsePrepareValueR32(&r32Src2, fMxcsr, pr32Val2);
15788 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15789 float32_t r32Result = f32_add(iemFpSoftF32FromIprt(&r32Src1), iemFpSoftF32FromIprt(&r32Src2), &SoftState);
15790 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr);
15791}
15792
15793
15794IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_addps_u128,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15795{
15796 return iemAImpl_addps_u128_worker(&pResult->ar32[0], uMxCsrIn, &puSrc1->ar32[0], &puSrc2->ar32[0])
15797 | iemAImpl_addps_u128_worker(&pResult->ar32[1], uMxCsrIn, &puSrc1->ar32[1], &puSrc2->ar32[1])
15798 | iemAImpl_addps_u128_worker(&pResult->ar32[2], uMxCsrIn, &puSrc1->ar32[2], &puSrc2->ar32[2])
15799 | iemAImpl_addps_u128_worker(&pResult->ar32[3], uMxCsrIn, &puSrc1->ar32[3], &puSrc2->ar32[3]);
15800}
15801#endif
15802
15803
15804/**
15805 * ADDSS
15806 */
15807#ifdef IEM_WITHOUT_ASSEMBLY
15808IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_addss_u128_r32,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
15809{
15810 pResult->ar32[1] = puSrc1->ar32[1];
15811 pResult->ar32[2] = puSrc1->ar32[2];
15812 pResult->ar32[3] = puSrc1->ar32[3];
15813 return iemAImpl_addps_u128_worker(&pResult->ar32[0], uMxCsrIn, &puSrc1->ar32[0], pr32Src2);
15814}
15815#endif
15816
15817
15818/**
15819 * ADDPD
15820 */
15821#ifdef IEM_WITHOUT_ASSEMBLY
15822static uint32_t iemAImpl_addpd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1, PCRTFLOAT64U pr64Val2)
15823{
15824 if (iemSseBinaryValIsNaNR64(pr64Res, pr64Val1, pr64Val2, &fMxcsr))
15825 return fMxcsr;
15826
15827 RTFLOAT64U r64Src1, r64Src2;
15828 fMxcsr |= iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
15829 fMxcsr |= iemSsePrepareValueR64(&r64Src2, fMxcsr, pr64Val2);
15830 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15831 float64_t r64Result = f64_add(iemFpSoftF64FromIprt(&r64Src1), iemFpSoftF64FromIprt(&r64Src2), &SoftState);
15832 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr);
15833}
15834
15835
15836IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_addpd_u128,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15837{
15838 return iemAImpl_addpd_u128_worker(&pResult->ar64[0], uMxCsrIn, &puSrc1->ar64[0], &puSrc2->ar64[0])
15839 | iemAImpl_addpd_u128_worker(&pResult->ar64[1], uMxCsrIn, &puSrc1->ar64[1], &puSrc2->ar64[1]);
15840}
15841#endif
15842
15843
15844/**
15845 * ADDSD
15846 */
15847#ifdef IEM_WITHOUT_ASSEMBLY
15848IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_addsd_u128_r64,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
15849{
15850 pResult->ar64[1] = puSrc1->ar64[1];
15851 return iemAImpl_addpd_u128_worker(&pResult->ar64[0], uMxCsrIn, &puSrc1->ar64[0], pr64Src2);
15852}
15853#endif
15854
15855
15856/**
15857 * MULPS
15858 */
15859#ifdef IEM_WITHOUT_ASSEMBLY
15860static uint32_t iemAImpl_mulps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1, PCRTFLOAT32U pr32Val2)
15861{
15862 if (iemSseBinaryValIsNaNR32(pr32Res, pr32Val1, pr32Val2, &fMxcsr))
15863 return fMxcsr;
15864
15865 RTFLOAT32U r32Src1, r32Src2;
15866 fMxcsr |= iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
15867 fMxcsr |= iemSsePrepareValueR32(&r32Src2, fMxcsr, pr32Val2);
15868 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15869 float32_t r32Result = f32_mul(iemFpSoftF32FromIprt(&r32Src1), iemFpSoftF32FromIprt(&r32Src2), &SoftState);
15870 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr);
15871}
15872
15873
15874IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_mulps_u128,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15875{
15876 return iemAImpl_mulps_u128_worker(&pResult->ar32[0], uMxCsrIn, &puSrc1->ar32[0], &puSrc2->ar32[0])
15877 | iemAImpl_mulps_u128_worker(&pResult->ar32[1], uMxCsrIn, &puSrc1->ar32[1], &puSrc2->ar32[1])
15878 | iemAImpl_mulps_u128_worker(&pResult->ar32[2], uMxCsrIn, &puSrc1->ar32[2], &puSrc2->ar32[2])
15879 | iemAImpl_mulps_u128_worker(&pResult->ar32[3], uMxCsrIn, &puSrc1->ar32[3], &puSrc2->ar32[3]);
15880}
15881#endif
15882
15883
15884/**
15885 * MULSS
15886 */
15887#ifdef IEM_WITHOUT_ASSEMBLY
15888IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_mulss_u128_r32,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
15889{
15890 pResult->ar32[1] = puSrc1->ar32[1];
15891 pResult->ar32[2] = puSrc1->ar32[2];
15892 pResult->ar32[3] = puSrc1->ar32[3];
15893 return iemAImpl_mulps_u128_worker(&pResult->ar32[0], uMxCsrIn, &puSrc1->ar32[0], pr32Src2);
15894}
15895#endif
15896
15897
15898/**
15899 * MULPD
15900 */
15901#ifdef IEM_WITHOUT_ASSEMBLY
15902static uint32_t iemAImpl_mulpd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1, PCRTFLOAT64U pr64Val2)
15903{
15904 if (iemSseBinaryValIsNaNR64(pr64Res, pr64Val1, pr64Val2, &fMxcsr))
15905 return fMxcsr;
15906
15907 RTFLOAT64U r64Src1, r64Src2;
15908 fMxcsr |= iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
15909 fMxcsr |= iemSsePrepareValueR64(&r64Src2, fMxcsr, pr64Val2);
15910 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15911 float64_t r64Result = f64_mul(iemFpSoftF64FromIprt(&r64Src1), iemFpSoftF64FromIprt(&r64Src2), &SoftState);
15912 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr);
15913}
15914
15915
15916IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_mulpd_u128,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15917{
15918 return iemAImpl_mulpd_u128_worker(&pResult->ar64[0], uMxCsrIn, &puSrc1->ar64[0], &puSrc2->ar64[0])
15919 | iemAImpl_mulpd_u128_worker(&pResult->ar64[1], uMxCsrIn, &puSrc1->ar64[1], &puSrc2->ar64[1]);
15920}
15921#endif
15922
15923
15924/**
15925 * MULSD
15926 */
15927#ifdef IEM_WITHOUT_ASSEMBLY
15928IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_mulsd_u128_r64,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
15929{
15930 pResult->ar64[1] = puSrc1->ar64[1];
15931 return iemAImpl_mulpd_u128_worker(&pResult->ar64[0], uMxCsrIn, &puSrc1->ar64[0], pr64Src2);
15932}
15933#endif
15934
15935
15936/**
15937 * SUBPS
15938 */
15939#ifdef IEM_WITHOUT_ASSEMBLY
15940static uint32_t iemAImpl_subps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1, PCRTFLOAT32U pr32Val2)
15941{
15942 if (iemSseBinaryValIsNaNR32(pr32Res, pr32Val1, pr32Val2, &fMxcsr))
15943 return fMxcsr;
15944
15945 RTFLOAT32U r32Src1, r32Src2;
15946 fMxcsr |= iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
15947 fMxcsr |= iemSsePrepareValueR32(&r32Src2, fMxcsr, pr32Val2);
15948 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15949 float32_t r32Result = f32_sub(iemFpSoftF32FromIprt(&r32Src1), iemFpSoftF32FromIprt(&r32Src2), &SoftState);
15950 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr);
15951}
15952
15953
15954IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_subps_u128,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15955{
15956 return iemAImpl_subps_u128_worker(&pResult->ar32[0], uMxCsrIn, &puSrc1->ar32[0], &puSrc2->ar32[0])
15957 | iemAImpl_subps_u128_worker(&pResult->ar32[1], uMxCsrIn, &puSrc1->ar32[1], &puSrc2->ar32[1])
15958 | iemAImpl_subps_u128_worker(&pResult->ar32[2], uMxCsrIn, &puSrc1->ar32[2], &puSrc2->ar32[2])
15959 | iemAImpl_subps_u128_worker(&pResult->ar32[3], uMxCsrIn, &puSrc1->ar32[3], &puSrc2->ar32[3]);
15960}
15961#endif
15962
15963
15964/**
15965 * SUBSS
15966 */
15967#ifdef IEM_WITHOUT_ASSEMBLY
15968IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_subss_u128_r32,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
15969{
15970 pResult->ar32[1] = puSrc1->ar32[1];
15971 pResult->ar32[2] = puSrc1->ar32[2];
15972 pResult->ar32[3] = puSrc1->ar32[3];
15973 return iemAImpl_subps_u128_worker(&pResult->ar32[0], uMxCsrIn, &puSrc1->ar32[0], pr32Src2);
15974}
15975#endif
15976
15977
15978/**
15979 * SUBPD
15980 */
15981#ifdef IEM_WITHOUT_ASSEMBLY
15982static uint32_t iemAImpl_subpd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1, PCRTFLOAT64U pr64Val2)
15983{
15984 if (iemSseBinaryValIsNaNR64(pr64Res, pr64Val1, pr64Val2, &fMxcsr))
15985 return fMxcsr;
15986
15987 RTFLOAT64U r64Src1, r64Src2;
15988 fMxcsr |= iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
15989 fMxcsr |= iemSsePrepareValueR64(&r64Src2, fMxcsr, pr64Val2);
15990 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15991 float64_t r64Result = f64_sub(iemFpSoftF64FromIprt(&r64Src1), iemFpSoftF64FromIprt(&r64Src2), &SoftState);
15992 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr);
15993}
15994
15995
15996IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_subpd_u128,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15997{
15998 return iemAImpl_subpd_u128_worker(&pResult->ar64[0], uMxCsrIn, &puSrc1->ar64[0], &puSrc2->ar64[0])
15999 | iemAImpl_subpd_u128_worker(&pResult->ar64[1], uMxCsrIn, &puSrc1->ar64[1], &puSrc2->ar64[1]);
16000}
16001#endif
16002
16003
16004/**
16005 * SUBSD
16006 */
16007#ifdef IEM_WITHOUT_ASSEMBLY
16008IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_subsd_u128_r64,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
16009{
16010 pResult->ar64[1] = puSrc1->ar64[1];
16011 return iemAImpl_subpd_u128_worker(&pResult->ar64[0], uMxCsrIn, &puSrc1->ar64[0], pr64Src2);
16012}
16013#endif
16014
16015
16016/**
16017 * MINPS
16018 */
16019#ifdef IEM_WITHOUT_ASSEMBLY
16020static uint32_t iemAImpl_minps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1, PCRTFLOAT32U pr32Val2)
16021{
16022 if (RTFLOAT32U_IS_NAN(pr32Val1) || RTFLOAT32U_IS_NAN(pr32Val2))
16023 {
16024 /* The DAZ flag gets honored but the DE flag will not get set because \#IE has higher priority. */
16025 iemSsePrepareValueR32(pr32Res, fMxcsr, pr32Val2);
16026 return fMxcsr | X86_MXCSR_IE;
16027 }
16028
16029 RTFLOAT32U r32Src1, r32Src2;
16030 fMxcsr |= iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
16031 fMxcsr |= iemSsePrepareValueR32(&r32Src2, fMxcsr, pr32Val2);
16032 if (RTFLOAT32U_IS_ZERO(&r32Src1) && RTFLOAT32U_IS_ZERO(&r32Src2))
16033 {
16034 *pr32Res = r32Src2;
16035 return fMxcsr;
16036 }
16037
16038 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16039 bool fLe = f32_le(iemFpSoftF32FromIprt(&r32Src1), iemFpSoftF32FromIprt(&r32Src2), &SoftState);
16040 return iemSseSoftStateAndR32ToMxcsrAndIprtResultNoFz(&SoftState,
16041 fLe
16042 ? iemFpSoftF32FromIprt(&r32Src1)
16043 : iemFpSoftF32FromIprt(&r32Src2),
16044 pr32Res, fMxcsr);
16045}
16046
16047
16048IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_minps_u128,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16049{
16050 return iemAImpl_minps_u128_worker(&pResult->ar32[0], uMxCsrIn, &puSrc1->ar32[0], &puSrc2->ar32[0])
16051 | iemAImpl_minps_u128_worker(&pResult->ar32[1], uMxCsrIn, &puSrc1->ar32[1], &puSrc2->ar32[1])
16052 | iemAImpl_minps_u128_worker(&pResult->ar32[2], uMxCsrIn, &puSrc1->ar32[2], &puSrc2->ar32[2])
16053 | iemAImpl_minps_u128_worker(&pResult->ar32[3], uMxCsrIn, &puSrc1->ar32[3], &puSrc2->ar32[3]);
16054}
16055#endif
16056
16057
16058/**
16059 * MINSS
16060 */
16061#ifdef IEM_WITHOUT_ASSEMBLY
16062IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_minss_u128_r32,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
16063{
16064 pResult->ar32[1] = puSrc1->ar32[1];
16065 pResult->ar32[2] = puSrc1->ar32[2];
16066 pResult->ar32[3] = puSrc1->ar32[3];
16067 return iemAImpl_minps_u128_worker(&pResult->ar32[0], uMxCsrIn, &puSrc1->ar32[0], pr32Src2);
16068}
16069#endif
16070
16071
16072/**
16073 * MINPD
16074 */
16075#ifdef IEM_WITHOUT_ASSEMBLY
16076static uint32_t iemAImpl_minpd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1, PCRTFLOAT64U pr64Val2)
16077{
16078 if (RTFLOAT64U_IS_NAN(pr64Val1) || RTFLOAT64U_IS_NAN(pr64Val2))
16079 {
16080 /* The DAZ flag gets honored but the DE flag will not get set because \#IE has higher priority. */
16081 iemSsePrepareValueR64(pr64Res, fMxcsr, pr64Val2);
16082 return fMxcsr | X86_MXCSR_IE;
16083 }
16084
16085 RTFLOAT64U r64Src1, r64Src2;
16086 fMxcsr |= iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
16087 fMxcsr |= iemSsePrepareValueR64(&r64Src2, fMxcsr, pr64Val2);
16088 if (RTFLOAT64U_IS_ZERO(&r64Src1) && RTFLOAT64U_IS_ZERO(&r64Src2))
16089 {
16090 *pr64Res = r64Src2;
16091 return fMxcsr;
16092 }
16093
16094 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16095 bool fLe = f64_le(iemFpSoftF64FromIprt(&r64Src1), iemFpSoftF64FromIprt(&r64Src2), &SoftState);
16096 return iemSseSoftStateAndR64ToMxcsrAndIprtResultNoFz(&SoftState,
16097 fLe
16098 ? iemFpSoftF64FromIprt(&r64Src1)
16099 : iemFpSoftF64FromIprt(&r64Src2),
16100 pr64Res, fMxcsr);
16101}
16102
16103
16104IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_minpd_u128,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16105{
16106 return iemAImpl_minpd_u128_worker(&pResult->ar64[0], uMxCsrIn, &puSrc1->ar64[0], &puSrc2->ar64[0])
16107 | iemAImpl_minpd_u128_worker(&pResult->ar64[1], uMxCsrIn, &puSrc1->ar64[1], &puSrc2->ar64[1]);
16108}
16109#endif
16110
16111
16112/**
16113 * MINSD
16114 */
16115#ifdef IEM_WITHOUT_ASSEMBLY
16116IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_minsd_u128_r64,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
16117{
16118 pResult->ar64[1] = puSrc1->ar64[1];
16119 return iemAImpl_minpd_u128_worker(&pResult->ar64[0], uMxCsrIn, &puSrc1->ar64[0], pr64Src2);
16120}
16121#endif
16122
16123
16124/**
16125 * DIVPS
16126 */
16127#ifdef IEM_WITHOUT_ASSEMBLY
16128static uint32_t iemAImpl_divps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1, PCRTFLOAT32U pr32Val2)
16129{
16130 if (iemSseBinaryValIsNaNR32(pr32Res, pr32Val1, pr32Val2, &fMxcsr))
16131 return fMxcsr;
16132
16133 RTFLOAT32U r32Src1, r32Src2;
16134 uint32_t fDe = iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
16135 fDe |= iemSsePrepareValueR32(&r32Src2, fMxcsr, pr32Val2);
16136 if (RTFLOAT32U_IS_ZERO(&r32Src2))
16137 {
16138 if ( RTFLOAT32U_IS_ZERO(&r32Src1)
16139 || RTFLOAT32U_IS_QUIET_NAN(&r32Src1))
16140 {
16141 *pr32Res = g_ar32QNaN[1];
16142 return fMxcsr | X86_MXCSR_IE;
16143 }
16144 else if (RTFLOAT32U_IS_INF(&r32Src1))
16145 {
16146 *pr32Res = g_ar32Infinity[r32Src1.s.fSign != r32Src2.s.fSign];
16147 return fMxcsr;
16148 }
16149 else
16150 {
16151 *pr32Res = g_ar32Infinity[r32Src1.s.fSign != r32Src2.s.fSign];
16152 return fMxcsr | X86_MXCSR_ZE;
16153 }
16154 }
16155
16156 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16157 float32_t r32Result = f32_div(iemFpSoftF32FromIprt(&r32Src1), iemFpSoftF32FromIprt(&r32Src2), &SoftState);
16158 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr | fDe);
16159}
16160
16161
16162IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_divps_u128,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16163{
16164 return iemAImpl_divps_u128_worker(&pResult->ar32[0], uMxCsrIn, &puSrc1->ar32[0], &puSrc2->ar32[0])
16165 | iemAImpl_divps_u128_worker(&pResult->ar32[1], uMxCsrIn, &puSrc1->ar32[1], &puSrc2->ar32[1])
16166 | iemAImpl_divps_u128_worker(&pResult->ar32[2], uMxCsrIn, &puSrc1->ar32[2], &puSrc2->ar32[2])
16167 | iemAImpl_divps_u128_worker(&pResult->ar32[3], uMxCsrIn, &puSrc1->ar32[3], &puSrc2->ar32[3]);
16168}
16169#endif
16170
16171
16172/**
16173 * DIVSS
16174 */
16175#ifdef IEM_WITHOUT_ASSEMBLY
16176IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_divss_u128_r32,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
16177{
16178 pResult->ar32[1] = puSrc1->ar32[1];
16179 pResult->ar32[2] = puSrc1->ar32[2];
16180 pResult->ar32[3] = puSrc1->ar32[3];
16181 return iemAImpl_divps_u128_worker(&pResult->ar32[0], uMxCsrIn, &puSrc1->ar32[0], pr32Src2);
16182}
16183#endif
16184
16185
16186/**
16187 * DIVPD
16188 */
16189#ifdef IEM_WITHOUT_ASSEMBLY
16190static uint32_t iemAImpl_divpd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1, PCRTFLOAT64U pr64Val2)
16191{
16192 if (iemSseBinaryValIsNaNR64(pr64Res, pr64Val1, pr64Val2, &fMxcsr))
16193 return fMxcsr;
16194
16195 RTFLOAT64U r64Src1, r64Src2;
16196 uint32_t fDe = iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
16197 fDe |= iemSsePrepareValueR64(&r64Src2, fMxcsr, pr64Val2);
16198 if (RTFLOAT64U_IS_ZERO(&r64Src2))
16199 {
16200 if ( RTFLOAT64U_IS_ZERO(&r64Src1)
16201 || RTFLOAT64U_IS_QUIET_NAN(&r64Src1))
16202 {
16203 *pr64Res = g_ar64QNaN[1];
16204 return fMxcsr | X86_MXCSR_IE;
16205 }
16206 else if (RTFLOAT64U_IS_INF(&r64Src1))
16207 {
16208 *pr64Res = g_ar64Infinity[r64Src1.s.fSign != r64Src2.s.fSign];
16209 return fMxcsr;
16210 }
16211 else
16212 {
16213 *pr64Res = g_ar64Infinity[r64Src1.s.fSign != r64Src2.s.fSign];
16214 return fMxcsr | X86_MXCSR_ZE;
16215 }
16216 }
16217
16218 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16219 float64_t r64Result = f64_div(iemFpSoftF64FromIprt(&r64Src1), iemFpSoftF64FromIprt(&r64Src2), &SoftState);
16220 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr | fDe);
16221}
16222
16223
16224IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_divpd_u128,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16225{
16226 return iemAImpl_divpd_u128_worker(&pResult->ar64[0], uMxCsrIn, &puSrc1->ar64[0], &puSrc2->ar64[0])
16227 | iemAImpl_divpd_u128_worker(&pResult->ar64[1], uMxCsrIn, &puSrc1->ar64[1], &puSrc2->ar64[1]);
16228}
16229#endif
16230
16231
16232/**
16233 * DIVSD
16234 */
16235#ifdef IEM_WITHOUT_ASSEMBLY
16236IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_divsd_u128_r64,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
16237{
16238 pResult->ar64[1] = puSrc1->ar64[1];
16239 return iemAImpl_divpd_u128_worker(&pResult->ar64[0], uMxCsrIn, &puSrc1->ar64[0], pr64Src2);
16240}
16241#endif
16242
16243
16244/**
16245 * MAXPS
16246 */
16247#ifdef IEM_WITHOUT_ASSEMBLY
16248static uint32_t iemAImpl_maxps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1, PCRTFLOAT32U pr32Val2)
16249{
16250 if (RTFLOAT32U_IS_NAN(pr32Val1) || RTFLOAT32U_IS_NAN(pr32Val2))
16251 {
16252 /* The DAZ flag gets honored but the DE flag will not get set because \#IE has higher priority. */
16253 iemSsePrepareValueR32(pr32Res, fMxcsr, pr32Val2);
16254 return fMxcsr | X86_MXCSR_IE;
16255 }
16256
16257 RTFLOAT32U r32Src1, r32Src2;
16258 fMxcsr |= iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
16259 fMxcsr |= iemSsePrepareValueR32(&r32Src2, fMxcsr, pr32Val2);
16260 if (RTFLOAT32U_IS_ZERO(&r32Src1) && RTFLOAT32U_IS_ZERO(&r32Src2))
16261 {
16262 *pr32Res = r32Src2;
16263 return fMxcsr;
16264 }
16265
16266 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16267 bool fLe = f32_le(iemFpSoftF32FromIprt(&r32Src1), iemFpSoftF32FromIprt(&r32Src2), &SoftState);
16268 return iemSseSoftStateAndR32ToMxcsrAndIprtResultNoFz(&SoftState,
16269 fLe
16270 ? iemFpSoftF32FromIprt(&r32Src2)
16271 : iemFpSoftF32FromIprt(&r32Src1),
16272 pr32Res, fMxcsr);
16273}
16274
16275
16276IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_maxps_u128,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16277{
16278 return iemAImpl_maxps_u128_worker(&pResult->ar32[0], uMxCsrIn, &puSrc1->ar32[0], &puSrc2->ar32[0])
16279 | iemAImpl_maxps_u128_worker(&pResult->ar32[1], uMxCsrIn, &puSrc1->ar32[1], &puSrc2->ar32[1])
16280 | iemAImpl_maxps_u128_worker(&pResult->ar32[2], uMxCsrIn, &puSrc1->ar32[2], &puSrc2->ar32[2])
16281 | iemAImpl_maxps_u128_worker(&pResult->ar32[3], uMxCsrIn, &puSrc1->ar32[3], &puSrc2->ar32[3]);
16282}
16283#endif
16284
16285
16286/**
16287 * MAXSS
16288 */
16289#ifdef IEM_WITHOUT_ASSEMBLY
16290IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_maxss_u128_r32,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
16291{
16292 pResult->ar32[1] = puSrc1->ar32[1];
16293 pResult->ar32[2] = puSrc1->ar32[2];
16294 pResult->ar32[3] = puSrc1->ar32[3];
16295 return iemAImpl_maxps_u128_worker(&pResult->ar32[0], uMxCsrIn, &puSrc1->ar32[0], pr32Src2);
16296}
16297#endif
16298
16299
16300/**
16301 * MAXPD
16302 */
16303#ifdef IEM_WITHOUT_ASSEMBLY
16304static uint32_t iemAImpl_maxpd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1, PCRTFLOAT64U pr64Val2)
16305{
16306 if (RTFLOAT64U_IS_NAN(pr64Val1) || RTFLOAT64U_IS_NAN(pr64Val2))
16307 {
16308 /* The DAZ flag gets honored but the DE flag will not get set because \#IE has higher priority. */
16309 iemSsePrepareValueR64(pr64Res, fMxcsr, pr64Val2);
16310 return fMxcsr | X86_MXCSR_IE;
16311 }
16312
16313 RTFLOAT64U r64Src1, r64Src2;
16314 fMxcsr |= iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
16315 fMxcsr |= iemSsePrepareValueR64(&r64Src2, fMxcsr, pr64Val2);
16316 if (RTFLOAT64U_IS_ZERO(&r64Src1) && RTFLOAT64U_IS_ZERO(&r64Src2))
16317 {
16318 *pr64Res = r64Src2;
16319 return fMxcsr;
16320 }
16321
16322 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16323 bool fLe = f64_le(iemFpSoftF64FromIprt(&r64Src1), iemFpSoftF64FromIprt(&r64Src2), &SoftState);
16324 return iemSseSoftStateAndR64ToMxcsrAndIprtResultNoFz(&SoftState,
16325 fLe
16326 ? iemFpSoftF64FromIprt(&r64Src2)
16327 : iemFpSoftF64FromIprt(&r64Src1),
16328 pr64Res, fMxcsr);
16329}
16330
16331
16332IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_maxpd_u128,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16333{
16334 return iemAImpl_maxpd_u128_worker(&pResult->ar64[0], uMxCsrIn, &puSrc1->ar64[0], &puSrc2->ar64[0])
16335 | iemAImpl_maxpd_u128_worker(&pResult->ar64[1], uMxCsrIn, &puSrc1->ar64[1], &puSrc2->ar64[1]);
16336}
16337#endif
16338
16339
16340/**
16341 * MAXSD
16342 */
16343#ifdef IEM_WITHOUT_ASSEMBLY
16344IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_maxsd_u128_r64,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
16345{
16346 pResult->ar64[1] = puSrc1->ar64[1];
16347 return iemAImpl_maxpd_u128_worker(&pResult->ar64[0], uMxCsrIn, &puSrc1->ar64[0], pr64Src2);
16348}
16349#endif
16350
16351
16352/**
16353 * CVTSS2SD
16354 */
16355#ifdef IEM_WITHOUT_ASSEMBLY
16356static uint32_t iemAImpl_cvtss2sd_u128_r32_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1)
16357{
16358 RTFLOAT32U r32Src1;
16359 fMxcsr |= iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
16360
16361 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16362 float64_t r64Result = f32_to_f64(iemFpSoftF32FromIprt(&r32Src1), &SoftState);
16363 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr);
16364}
16365
16366
16367IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cvtss2sd_u128_r32,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
16368{
16369 pResult->ar64[1] = puSrc1->ar64[1];
16370 return iemAImpl_cvtss2sd_u128_r32_worker(&pResult->ar64[0], uMxCsrIn, pr32Src2);
16371}
16372#endif
16373
16374
16375/**
16376 * CVTSD2SS
16377 */
16378#ifdef IEM_WITHOUT_ASSEMBLY
16379static uint32_t iemAImpl_cvtsd2ss_u128_r64_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1)
16380{
16381 RTFLOAT64U r64Src1;
16382 fMxcsr |= iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
16383
16384 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16385 float32_t r32Result = f64_to_f32(iemFpSoftF64FromIprt(&r64Src1), &SoftState);
16386 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr);
16387}
16388
16389
16390IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cvtsd2ss_u128_r64,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
16391{
16392 pResult->ar32[1] = puSrc1->ar32[1];
16393 pResult->ar32[2] = puSrc1->ar32[2];
16394 pResult->ar32[3] = puSrc1->ar32[3];
16395 return iemAImpl_cvtsd2ss_u128_r64_worker(&pResult->ar32[0], uMxCsrIn, pr64Src2);
16396}
16397#endif
16398
16399
16400/**
16401 * HADDPS
16402 */
16403#ifdef IEM_WITHOUT_ASSEMBLY
16404IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_haddps_u128,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16405{
16406 return iemAImpl_addps_u128_worker(&pResult->ar32[0], uMxCsrIn, &puSrc1->ar32[0], &puSrc1->ar32[1])
16407 | iemAImpl_addps_u128_worker(&pResult->ar32[1], uMxCsrIn, &puSrc1->ar32[2], &puSrc1->ar32[3])
16408 | iemAImpl_addps_u128_worker(&pResult->ar32[2], uMxCsrIn, &puSrc2->ar32[0], &puSrc2->ar32[1])
16409 | iemAImpl_addps_u128_worker(&pResult->ar32[3], uMxCsrIn, &puSrc2->ar32[2], &puSrc2->ar32[3]);
16410}
16411#endif
16412
16413
16414/**
16415 * HADDPD
16416 */
16417#ifdef IEM_WITHOUT_ASSEMBLY
16418IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_haddpd_u128,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16419{
16420 return iemAImpl_addpd_u128_worker(&pResult->ar64[0], uMxCsrIn, &puSrc1->ar64[0], &puSrc1->ar64[1])
16421 | iemAImpl_addpd_u128_worker(&pResult->ar64[1], uMxCsrIn, &puSrc2->ar64[0], &puSrc2->ar64[1]);
16422}
16423#endif
16424
16425
16426/**
16427 * HSUBPS
16428 */
16429#ifdef IEM_WITHOUT_ASSEMBLY
16430IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_hsubps_u128,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16431{
16432 return iemAImpl_subps_u128_worker(&pResult->ar32[0], uMxCsrIn, &puSrc1->ar32[0], &puSrc1->ar32[1])
16433 | iemAImpl_subps_u128_worker(&pResult->ar32[1], uMxCsrIn, &puSrc1->ar32[2], &puSrc1->ar32[3])
16434 | iemAImpl_subps_u128_worker(&pResult->ar32[2], uMxCsrIn, &puSrc2->ar32[0], &puSrc2->ar32[1])
16435 | iemAImpl_subps_u128_worker(&pResult->ar32[3], uMxCsrIn, &puSrc2->ar32[2], &puSrc2->ar32[3]);
16436}
16437#endif
16438
16439
16440/**
16441 * HSUBPD
16442 */
16443#ifdef IEM_WITHOUT_ASSEMBLY
16444IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_hsubpd_u128,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16445{
16446 return iemAImpl_subpd_u128_worker(&pResult->ar64[0], uMxCsrIn, &puSrc1->ar64[0], &puSrc1->ar64[1])
16447 | iemAImpl_subpd_u128_worker(&pResult->ar64[1], uMxCsrIn, &puSrc2->ar64[0], &puSrc2->ar64[1]);
16448}
16449#endif
16450
16451
16452/**
16453 * SQRTPS
16454 */
16455#ifdef IEM_WITHOUT_ASSEMBLY
16456static uint32_t iemAImpl_sqrtps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val)
16457{
16458 if (iemSseUnaryValIsNaNR32(pr32Res, pr32Val, &fMxcsr))
16459 return fMxcsr;
16460
16461 RTFLOAT32U r32Src;
16462 uint32_t fDe = iemSsePrepareValueR32(&r32Src, fMxcsr, pr32Val);
16463 if (RTFLOAT32U_IS_ZERO(&r32Src))
16464 {
16465 *pr32Res = r32Src;
16466 return fMxcsr;
16467 }
16468 else if (r32Src.s.fSign)
16469 {
16470 *pr32Res = g_ar32QNaN[1];
16471 return fMxcsr | X86_MXCSR_IE;
16472 }
16473
16474 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16475 float32_t r32Result = f32_sqrt(iemFpSoftF32FromIprt(&r32Src), &SoftState);
16476 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr | fDe);
16477}
16478
16479
16480IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_sqrtps_u128,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16481{
16482 RT_NOREF(puSrc1);
16483
16484 return iemAImpl_sqrtps_u128_worker(&pResult->ar32[0], uMxCsrIn, &puSrc2->ar32[0])
16485 | iemAImpl_sqrtps_u128_worker(&pResult->ar32[1], uMxCsrIn, &puSrc2->ar32[1])
16486 | iemAImpl_sqrtps_u128_worker(&pResult->ar32[2], uMxCsrIn, &puSrc2->ar32[2])
16487 | iemAImpl_sqrtps_u128_worker(&pResult->ar32[3], uMxCsrIn, &puSrc2->ar32[3]);
16488}
16489#endif
16490
16491
16492/**
16493 * SQRTSS
16494 */
16495#ifdef IEM_WITHOUT_ASSEMBLY
16496IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_sqrtss_u128_r32,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
16497{
16498 pResult->ar32[1] = puSrc1->ar32[1];
16499 pResult->ar32[2] = puSrc1->ar32[2];
16500 pResult->ar32[3] = puSrc1->ar32[3];
16501 return iemAImpl_sqrtps_u128_worker(&pResult->ar32[0], uMxCsrIn, pr32Src2);
16502}
16503#endif
16504
16505
16506/**
16507 * SQRTPD
16508 */
16509#ifdef IEM_WITHOUT_ASSEMBLY
16510static uint32_t iemAImpl_sqrtpd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val)
16511{
16512 if (iemSseUnaryValIsNaNR64(pr64Res, pr64Val, &fMxcsr))
16513 return fMxcsr;
16514
16515 RTFLOAT64U r64Src;
16516 uint32_t fDe = iemSsePrepareValueR64(&r64Src, fMxcsr, pr64Val);
16517 if (RTFLOAT64U_IS_ZERO(&r64Src))
16518 {
16519 *pr64Res = r64Src;
16520 return fMxcsr;
16521 }
16522 else if (r64Src.s.fSign)
16523 {
16524 *pr64Res = g_ar64QNaN[1];
16525 return fMxcsr | X86_MXCSR_IE;
16526 }
16527
16528 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16529 float64_t r64Result = f64_sqrt(iemFpSoftF64FromIprt(&r64Src), &SoftState);
16530 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr | fDe);
16531}
16532
16533
16534IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_sqrtpd_u128,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16535{
16536 RT_NOREF(puSrc1);
16537
16538 return iemAImpl_sqrtpd_u128_worker(&pResult->ar64[0], uMxCsrIn, &puSrc2->ar64[0])
16539 | iemAImpl_sqrtpd_u128_worker(&pResult->ar64[1], uMxCsrIn, &puSrc2->ar64[1]);
16540}
16541#endif
16542
16543
16544/**
16545 * SQRTSD
16546 */
16547#ifdef IEM_WITHOUT_ASSEMBLY
16548IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_sqrtsd_u128_r64,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
16549{
16550 pResult->ar64[1] = puSrc1->ar64[1];
16551 return iemAImpl_sqrtpd_u128_worker(&pResult->ar64[0], uMxCsrIn, pr64Src2);
16552}
16553#endif
16554
16555
16556#ifdef IEM_WITHOUT_ASSEMBLY
16557/**
16558 * RSQRTPS
16559 */
16560static uint32_t iemAImpl_rsqrt_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val)
16561{
16562 if (iemSseUnaryValIsNaNR32(pr32Res, pr32Val, &fMxcsr))
16563 return fMxcsr;
16564
16565 RTFLOAT32U r32Src;
16566 iemSsePrepareValueR32(&r32Src, fMxcsr | X86_MXCSR_DAZ, pr32Val);
16567 if (RTFLOAT32U_IS_ZERO(&r32Src))
16568 {
16569 *pr32Res = g_ar32Infinity[r32Src.s.fSign];
16570 return fMxcsr;
16571 }
16572 else if (r32Src.s.fSign)
16573 {
16574 *pr32Res = g_ar32QNaN[1];
16575 return fMxcsr | X86_MXCSR_IE;
16576 }
16577
16578 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16579 float32_t r32Result = f32_rsqrt(iemFpSoftF32FromIprt(&r32Src), &SoftState);
16580 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr);
16581}
16582
16583
16584IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_rsqrtps_u128,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16585{
16586 RT_NOREF(puSrc1);
16587
16588 return iemAImpl_rsqrt_worker(&pResult->ar32[0], uMxCsrIn, &puSrc2->ar32[0])
16589 | iemAImpl_rsqrt_worker(&pResult->ar32[1], uMxCsrIn, &puSrc2->ar32[1])
16590 | iemAImpl_rsqrt_worker(&pResult->ar32[2], uMxCsrIn, &puSrc2->ar32[2])
16591 | iemAImpl_rsqrt_worker(&pResult->ar32[3], uMxCsrIn, &puSrc2->ar32[3]);
16592}
16593
16594
16595/**
16596 * RSQRTSS
16597 */
16598IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_rsqrtss_u128_r32,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
16599{
16600 pResult->ar32[1] = puSrc1->ar32[1];
16601 pResult->ar32[2] = puSrc1->ar32[2];
16602 pResult->ar32[3] = puSrc1->ar32[3];
16603 return iemAImpl_rsqrt_worker(&pResult->ar32[0], uMxCsrIn, pr32Src2);
16604}
16605#endif
16606
16607
16608/**
16609 * RCPPS
16610 */
16611#ifdef IEM_WITHOUT_ASSEMBLY
16612static uint32_t iemAImpl_rcp_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val)
16613{
16614 if (iemSseUnaryValIsNaNR32(pr32Res, pr32Val, &fMxcsr))
16615 return fMxcsr;
16616
16617 RTFLOAT32U r32Src;
16618 iemSsePrepareValueR32(&r32Src, fMxcsr | X86_MXCSR_DAZ, pr32Val);
16619 if (RTFLOAT32U_IS_ZERO(&r32Src))
16620 {
16621 *pr32Res = g_ar32Infinity[r32Src.s.fSign];
16622 return fMxcsr;
16623 }
16624
16625 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16626 float32_t r32Result = f32_div(iemFpSoftF32FromIprt(&g_ar32One[0]), iemFpSoftF32FromIprt(&r32Src), &SoftState);
16627 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr);
16628}
16629
16630
16631IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_rcpps_u128,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16632{
16633 RT_NOREF(puSrc1);
16634
16635 return iemAImpl_rcp_worker(&pResult->ar32[0], uMxCsrIn, &puSrc2->ar32[0])
16636 | iemAImpl_rcp_worker(&pResult->ar32[1], uMxCsrIn, &puSrc2->ar32[1])
16637 | iemAImpl_rcp_worker(&pResult->ar32[2], uMxCsrIn, &puSrc2->ar32[2])
16638 | iemAImpl_rcp_worker(&pResult->ar32[3], uMxCsrIn, &puSrc2->ar32[3]);
16639}
16640
16641
16642/**
16643 * RCPSS
16644 */
16645IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_rcpss_u128_r32,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
16646{
16647 pResult->ar32[1] = puSrc1->ar32[1];
16648 pResult->ar32[2] = puSrc1->ar32[2];
16649 pResult->ar32[3] = puSrc1->ar32[3];
16650 return iemAImpl_rcp_worker(&pResult->ar32[0], uMxCsrIn, pr32Src2);
16651}
16652#endif
16653
16654
16655/**
16656 * ADDSUBPS
16657 */
16658#ifdef IEM_WITHOUT_ASSEMBLY
16659IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_addsubps_u128,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16660{
16661 RT_NOREF(puSrc1);
16662
16663 return iemAImpl_subps_u128_worker(&pResult->ar32[0], uMxCsrIn, &puSrc1->ar32[0], &puSrc2->ar32[0])
16664 | iemAImpl_addps_u128_worker(&pResult->ar32[1], uMxCsrIn, &puSrc1->ar32[1], &puSrc2->ar32[1])
16665 | iemAImpl_subps_u128_worker(&pResult->ar32[2], uMxCsrIn, &puSrc1->ar32[2], &puSrc2->ar32[2])
16666 | iemAImpl_addps_u128_worker(&pResult->ar32[3], uMxCsrIn, &puSrc1->ar32[3], &puSrc2->ar32[3]);
16667}
16668#endif
16669
16670
16671/**
16672 * ADDSUBPD
16673 */
16674#ifdef IEM_WITHOUT_ASSEMBLY
16675IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_addsubpd_u128,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16676{
16677 RT_NOREF(puSrc1);
16678
16679 return iemAImpl_subpd_u128_worker(&pResult->ar64[0], uMxCsrIn, &puSrc1->ar64[0], &puSrc2->ar64[0])
16680 | iemAImpl_addpd_u128_worker(&pResult->ar64[1], uMxCsrIn, &puSrc1->ar64[1], &puSrc2->ar64[1]);
16681}
16682#endif
16683
16684
16685/**
16686 * CVTPD2PS
16687 */
16688#ifdef IEM_WITHOUT_ASSEMBLY
16689static uint32_t iemAImpl_cvtpd2ps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1)
16690{
16691 RTFLOAT64U r64Src1;
16692 fMxcsr |= iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
16693
16694 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16695 float32_t r32Result = f64_to_f32(iemFpSoftF64FromIprt(&r64Src1), &SoftState);
16696 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr);
16697}
16698
16699
16700IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cvtpd2ps_u128,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16701{
16702 RT_NOREF(puSrc1);
16703
16704 pResult->au32[2] = 0;
16705 pResult->au32[3] = 0;
16706 return iemAImpl_cvtpd2ps_u128_worker(&pResult->ar32[0], uMxCsrIn, &puSrc2->ar64[0])
16707 | iemAImpl_cvtpd2ps_u128_worker(&pResult->ar32[1], uMxCsrIn, &puSrc2->ar64[1]);
16708}
16709#endif
16710
16711
16712/**
16713 * CVTPS2PD
16714 */
16715#ifdef IEM_WITHOUT_ASSEMBLY
16716static uint32_t iemAImpl_cvtps2pd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1)
16717{
16718 RTFLOAT32U r32Src1;
16719 fMxcsr |= iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
16720
16721 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16722 float64_t r64Result = f32_to_f64(iemFpSoftF32FromIprt(&r32Src1), &SoftState);
16723 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr);
16724}
16725
16726
16727IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cvtps2pd_u128,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16728{
16729 RT_NOREF(puSrc1);
16730
16731 return iemAImpl_cvtps2pd_u128_worker(&pResult->ar64[0], uMxCsrIn, &puSrc2->ar32[0])
16732 | iemAImpl_cvtps2pd_u128_worker(&pResult->ar64[1], uMxCsrIn, &puSrc2->ar32[1]);
16733}
16734#endif
16735
16736
16737/**
16738 * CVTDQ2PS
16739 */
16740#ifdef IEM_WITHOUT_ASSEMBLY
16741static uint32_t iemAImpl_cvtdq2ps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, int32_t i32Val)
16742{
16743 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16744 float32_t r32Result = i32_to_f32(i32Val, &SoftState);
16745 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr);
16746}
16747
16748
16749IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cvtdq2ps_u128,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16750{
16751 RT_NOREF(puSrc1);
16752
16753 return iemAImpl_cvtdq2ps_u128_worker(&pResult->ar32[0], uMxCsrIn, puSrc2->ai32[0])
16754 | iemAImpl_cvtdq2ps_u128_worker(&pResult->ar32[1], uMxCsrIn, puSrc2->ai32[1])
16755 | iemAImpl_cvtdq2ps_u128_worker(&pResult->ar32[2], uMxCsrIn, puSrc2->ai32[2])
16756 | iemAImpl_cvtdq2ps_u128_worker(&pResult->ar32[3], uMxCsrIn, puSrc2->ai32[3]);
16757}
16758#endif
16759
16760
16761/**
16762 * CVTPS2DQ
16763 */
16764#ifdef IEM_WITHOUT_ASSEMBLY
16765static uint32_t iemAImpl_cvtps2dq_u128_worker(int32_t *pi32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Src)
16766{
16767 RTFLOAT32U r32Src;
16768 iemSsePrepareValueR32(&r32Src, fMxcsr, pr32Src); /* De-normal seems to be ignored. */
16769
16770 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16771 *pi32Res = f32_to_i32(iemFpSoftF32FromIprt(&r32Src), SoftState.roundingMode, true /*exact*/, &SoftState);
16772 return fMxcsr | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
16773}
16774
16775
16776IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cvtps2dq_u128,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16777{
16778 RT_NOREF(puSrc1);
16779
16780 return iemAImpl_cvtps2dq_u128_worker(&pResult->ai32[0], uMxCsrIn, &puSrc2->ar32[0])
16781 | iemAImpl_cvtps2dq_u128_worker(&pResult->ai32[1], uMxCsrIn, &puSrc2->ar32[1])
16782 | iemAImpl_cvtps2dq_u128_worker(&pResult->ai32[2], uMxCsrIn, &puSrc2->ar32[2])
16783 | iemAImpl_cvtps2dq_u128_worker(&pResult->ai32[3], uMxCsrIn, &puSrc2->ar32[3]);
16784}
16785#endif
16786
16787
16788/**
16789 * CVTTPS2DQ
16790 */
16791#ifdef IEM_WITHOUT_ASSEMBLY
16792static uint32_t iemAImpl_cvttps2dq_u128_worker(int32_t *pi32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Src)
16793{
16794 RTFLOAT32U r32Src;
16795 iemSsePrepareValueR32(&r32Src, fMxcsr, pr32Src); /* De-normal seems to be ignored. */
16796
16797 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16798 SoftState.roundingMode = softfloat_round_minMag;
16799 *pi32Res = f32_to_i32_r_minMag(iemFpSoftF32FromIprt(&r32Src), true /*exact*/, &SoftState);
16800 return fMxcsr | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
16801}
16802
16803
16804IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cvttps2dq_u128,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16805{
16806 RT_NOREF(puSrc1);
16807
16808 return iemAImpl_cvttps2dq_u128_worker(&pResult->ai32[0], uMxCsrIn, &puSrc2->ar32[0])
16809 | iemAImpl_cvttps2dq_u128_worker(&pResult->ai32[1], uMxCsrIn, &puSrc2->ar32[1])
16810 | iemAImpl_cvttps2dq_u128_worker(&pResult->ai32[2], uMxCsrIn, &puSrc2->ar32[2])
16811 | iemAImpl_cvttps2dq_u128_worker(&pResult->ai32[3], uMxCsrIn, &puSrc2->ar32[3]);
16812}
16813#endif
16814
16815
16816/**
16817 * CVTTPD2DQ
16818 */
16819#ifdef IEM_WITHOUT_ASSEMBLY
16820static uint32_t iemAImpl_cvttpd2dq_u128_worker(int32_t *pi32Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Src)
16821{
16822 RTFLOAT64U r64Src;
16823 iemSsePrepareValueR64(&r64Src, fMxcsr, pr64Src); /* De-normal seems to be ignored. */
16824
16825 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16826 SoftState.roundingMode = softfloat_round_minMag;
16827 *pi32Res = f64_to_i32(iemFpSoftF64FromIprt(&r64Src), SoftState.roundingMode, true /*exact*/, &SoftState);
16828 return fMxcsr | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
16829}
16830
16831
16832IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cvttpd2dq_u128,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16833{
16834 RT_NOREF(puSrc1);
16835
16836 pResult->au64[1] = 0;
16837 return iemAImpl_cvttpd2dq_u128_worker(&pResult->ai32[0], uMxCsrIn, &puSrc2->ar64[0])
16838 | iemAImpl_cvttpd2dq_u128_worker(&pResult->ai32[1], uMxCsrIn, &puSrc2->ar64[1]);
16839}
16840#endif
16841
16842
16843/**
16844 * CVTDQ2PD
16845 */
16846#ifdef IEM_WITHOUT_ASSEMBLY
16847static uint32_t iemAImpl_cvtdq2pd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, int32_t i32Val)
16848{
16849 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16850 float64_t r64Result = i32_to_f64(i32Val, &SoftState);
16851 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr);
16852}
16853
16854
16855IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cvtdq2pd_u128,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16856{
16857 RT_NOREF(puSrc1);
16858
16859 return iemAImpl_cvtdq2pd_u128_worker(&pResult->ar64[0], uMxCsrIn, puSrc2->ai32[0])
16860 | iemAImpl_cvtdq2pd_u128_worker(&pResult->ar64[1], uMxCsrIn, puSrc2->ai32[1]);
16861}
16862#endif
16863
16864
16865/**
16866 * CVTPD2DQ
16867 */
16868#ifdef IEM_WITHOUT_ASSEMBLY
16869static uint32_t iemAImpl_cvtpd2dq_u128_worker(int32_t *pi32Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Src)
16870{
16871 RTFLOAT64U r64Src;
16872 iemSsePrepareValueR64(&r64Src, fMxcsr, pr64Src); /* De-normal seems to be ignored. */
16873
16874 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16875 *pi32Res = f64_to_i32(iemFpSoftF64FromIprt(&r64Src), SoftState.roundingMode, true /*exact*/, &SoftState);
16876 return fMxcsr | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
16877}
16878
16879
16880IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cvtpd2dq_u128,(uint32_t uMxCsrIn, PX86XMMREG pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16881{
16882 RT_NOREF(puSrc1);
16883
16884 pResult->au64[1] = 0;
16885 return iemAImpl_cvtpd2dq_u128_worker(&pResult->ai32[0], uMxCsrIn, &puSrc2->ar64[0])
16886 | iemAImpl_cvtpd2dq_u128_worker(&pResult->ai32[1], uMxCsrIn, &puSrc2->ar64[1]);
16887}
16888#endif
16889
16890
16891/**
16892 * [V]SHUFPS
16893 */
16894#ifdef IEM_WITHOUT_ASSEMBLY
16895IEM_DECL_IMPL_DEF(void, iemAImpl_shufps_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
16896{
16897 RTUINT128U const uSrc1 = *puDst;
16898 RTUINT128U const uSrc2 = *puSrc;
16899 ASMCompilerBarrier();
16900 puDst->au32[0] = uSrc1.au32[bEvil & 0x3];
16901 puDst->au32[1] = uSrc1.au32[(bEvil >> 2) & 0x3];
16902 puDst->au32[2] = uSrc2.au32[(bEvil >> 4) & 0x3];
16903 puDst->au32[3] = uSrc2.au32[(bEvil >> 6) & 0x3];
16904}
16905#endif
16906
16907
16908IEM_DECL_IMPL_DEF(void, iemAImpl_vshufps_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
16909{
16910 RTUINT128U const uSrc1 = *puSrc1;
16911 RTUINT128U const uSrc2 = *puSrc2;
16912 ASMCompilerBarrier();
16913 puDst->au32[0] = uSrc1.au32[bEvil & 0x3];
16914 puDst->au32[1] = uSrc1.au32[(bEvil >> 2) & 0x3];
16915 puDst->au32[2] = uSrc2.au32[(bEvil >> 4) & 0x3];
16916 puDst->au32[3] = uSrc2.au32[(bEvil >> 6) & 0x3];
16917}
16918
16919
16920IEM_DECL_IMPL_DEF(void, iemAImpl_vshufps_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bEvil))
16921{
16922 RTUINT256U const uSrc1 = *puSrc1;
16923 RTUINT256U const uSrc2 = *puSrc2;
16924 ASMCompilerBarrier();
16925 puDst->au32[0] = uSrc1.au32[bEvil & 0x3];
16926 puDst->au32[1] = uSrc1.au32[(bEvil >> 2) & 0x3];
16927 puDst->au32[2] = uSrc2.au32[(bEvil >> 4) & 0x3];
16928 puDst->au32[3] = uSrc2.au32[(bEvil >> 6) & 0x3];
16929
16930 puDst->au32[4] = uSrc1.au32[4 + (bEvil & 0x3)];
16931 puDst->au32[5] = uSrc1.au32[4 + ((bEvil >> 2) & 0x3)];
16932 puDst->au32[6] = uSrc2.au32[4 + ((bEvil >> 4) & 0x3)];
16933 puDst->au32[7] = uSrc2.au32[4 + ((bEvil >> 6) & 0x3)];
16934}
16935
16936
16937/**
16938 * [V]SHUFPD
16939 */
16940#ifdef IEM_WITHOUT_ASSEMBLY
16941IEM_DECL_IMPL_DEF(void, iemAImpl_shufpd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
16942{
16943 RTUINT128U const uSrc1 = *puDst;
16944 RTUINT128U const uSrc2 = *puSrc;
16945 ASMCompilerBarrier();
16946 puDst->au64[0] = (bEvil & RT_BIT(0)) ? uSrc1.au64[1] : uSrc1.au64[0];
16947 puDst->au64[1] = (bEvil & RT_BIT(1)) ? uSrc2.au64[1] : uSrc2.au64[0];
16948}
16949#endif
16950
16951
16952IEM_DECL_IMPL_DEF(void, iemAImpl_vshufpd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
16953{
16954 RTUINT128U const uSrc1 = *puSrc1;
16955 RTUINT128U const uSrc2 = *puSrc2;
16956 ASMCompilerBarrier();
16957 puDst->au64[0] = (bEvil & RT_BIT(0)) ? uSrc1.au64[1] : uSrc1.au64[0];
16958 puDst->au64[1] = (bEvil & RT_BIT(1)) ? uSrc2.au64[1] : uSrc2.au64[0];
16959}
16960
16961
16962IEM_DECL_IMPL_DEF(void, iemAImpl_vshufpd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bEvil))
16963{
16964 RTUINT256U const uSrc1 = *puSrc1;
16965 RTUINT256U const uSrc2 = *puSrc2;
16966 ASMCompilerBarrier();
16967 puDst->au64[0] = (bEvil & RT_BIT(0)) ? uSrc1.au64[1] : uSrc1.au64[0];
16968 puDst->au64[1] = (bEvil & RT_BIT(1)) ? uSrc2.au64[1] : uSrc2.au64[0];
16969 puDst->au64[2] = (bEvil & RT_BIT(2)) ? uSrc1.au64[3] : uSrc1.au64[2];
16970 puDst->au64[3] = (bEvil & RT_BIT(3)) ? uSrc2.au64[3] : uSrc2.au64[2];
16971}
16972
16973
16974/*
16975 * PHMINPOSUW / VPHMINPOSUW
16976 */
16977IEM_DECL_IMPL_DEF(void, iemAImpl_phminposuw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
16978{
16979 uint16_t u16Min = puSrc->au16[0];
16980 uint8_t idxMin = 0;
16981
16982 for (uint8_t i = 1; i < RT_ELEMENTS(puSrc->au16); i++)
16983 if (puSrc->au16[i] < u16Min)
16984 {
16985 u16Min = puSrc->au16[i];
16986 idxMin = i;
16987 }
16988
16989 puDst->au64[0] = 0;
16990 puDst->au64[1] = 0;
16991 puDst->au16[0] = u16Min;
16992 puDst->au16[1] = idxMin;
16993}
16994
16995
16996IEM_DECL_IMPL_DEF(void, iemAImpl_vphminposuw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
16997{
16998 iemAImpl_phminposuw_u128_fallback(puDst, puSrc);
16999}
17000
17001
17002/**
17003 * VPERMILPS
17004 */
17005#ifdef IEM_WITHOUT_ASSEMBLY
17006IEM_DECL_IMPL_DEF(void, iemAImpl_vpermilps_imm_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
17007{
17008 RTUINT128U const uSrc = *puSrc;
17009 ASMCompilerBarrier();
17010
17011 puDst->au32[0] = uSrc.au32[bEvil & 0x3];
17012 puDst->au32[1] = uSrc.au32[(bEvil >> 2) & 0x3];
17013 puDst->au32[2] = uSrc.au32[(bEvil >> 4) & 0x3];
17014 puDst->au32[3] = uSrc.au32[(bEvil >> 6) & 0x3];
17015}
17016
17017
17018IEM_DECL_IMPL_DEF(void, iemAImpl_vpermilps_imm_u256,(PRTUINT256U puDst, PCRTUINT256U puSrc, uint8_t bEvil))
17019{
17020 RTUINT256U const uSrc = *puSrc;
17021 ASMCompilerBarrier();
17022
17023 puDst->au32[0] = uSrc.au32[bEvil & 0x3];
17024 puDst->au32[1] = uSrc.au32[(bEvil >> 2) & 0x3];
17025 puDst->au32[2] = uSrc.au32[(bEvil >> 4) & 0x3];
17026 puDst->au32[3] = uSrc.au32[(bEvil >> 6) & 0x3];
17027
17028 puDst->au32[4] = uSrc.au32[4 + (bEvil & 0x3)];
17029 puDst->au32[5] = uSrc.au32[4 + ((bEvil >> 2) & 0x3)];
17030 puDst->au32[6] = uSrc.au32[4 + ((bEvil >> 4) & 0x3)];
17031 puDst->au32[7] = uSrc.au32[4 + ((bEvil >> 6) & 0x3)];
17032}
17033
17034IEM_DECL_IMPL_DEF(void, iemAImpl_vpermilps_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
17035{
17036 RTUINT128U const uSrc1 = *puSrc1;
17037 RTUINT128U const uSrc2 = *puSrc2;
17038 ASMCompilerBarrier();
17039
17040 puDst->au32[0] = uSrc1.au32[uSrc2.au8[0] & 0x3];
17041 puDst->au32[1] = uSrc1.au32[uSrc2.au8[4] & 0x3];
17042 puDst->au32[2] = uSrc1.au32[uSrc2.au8[8] & 0x3];
17043 puDst->au32[3] = uSrc1.au32[uSrc2.au8[12] & 0x3];
17044}
17045
17046IEM_DECL_IMPL_DEF(void, iemAImpl_vpermilps_u256,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
17047{
17048 RTUINT256U const uSrc1 = *puSrc1;
17049 RTUINT256U const uSrc2 = *puSrc2;
17050 ASMCompilerBarrier();
17051
17052 puDst->au32[0] = uSrc1.au32[uSrc2.au8[0] & 0x3];
17053 puDst->au32[1] = uSrc1.au32[uSrc2.au8[4] & 0x3];
17054 puDst->au32[2] = uSrc1.au32[uSrc2.au8[8] & 0x3];
17055 puDst->au32[3] = uSrc1.au32[uSrc2.au8[12] & 0x3];
17056
17057 puDst->au32[4] = uSrc1.au32[4 + (uSrc2.au8[16] & 0x3)];
17058 puDst->au32[5] = uSrc1.au32[4 + (uSrc2.au8[20] & 0x3)];
17059 puDst->au32[6] = uSrc1.au32[4 + (uSrc2.au8[24] & 0x3)];
17060 puDst->au32[7] = uSrc1.au32[4 + (uSrc2.au8[28] & 0x3)];
17061}
17062#endif
17063
17064
17065IEM_DECL_IMPL_DEF(void, iemAImpl_vpermilps_imm_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
17066{
17067 RTUINT128U const uSrc = *puSrc;
17068 ASMCompilerBarrier();
17069
17070 puDst->au32[0] = uSrc.au32[bEvil & 0x3];
17071 puDst->au32[1] = uSrc.au32[(bEvil >> 2) & 0x3];
17072 puDst->au32[2] = uSrc.au32[(bEvil >> 4) & 0x3];
17073 puDst->au32[3] = uSrc.au32[(bEvil >> 6) & 0x3];
17074}
17075
17076
17077IEM_DECL_IMPL_DEF(void, iemAImpl_vpermilps_imm_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc, uint8_t bEvil))
17078{
17079 RTUINT256U const uSrc = *puSrc;
17080 ASMCompilerBarrier();
17081
17082 puDst->au32[0] = uSrc.au32[bEvil & 0x3];
17083 puDst->au32[1] = uSrc.au32[(bEvil >> 2) & 0x3];
17084 puDst->au32[2] = uSrc.au32[(bEvil >> 4) & 0x3];
17085 puDst->au32[3] = uSrc.au32[(bEvil >> 6) & 0x3];
17086
17087 puDst->au32[4] = uSrc.au32[4 + (bEvil & 0x3)];
17088 puDst->au32[5] = uSrc.au32[4 + ((bEvil >> 2) & 0x3)];
17089 puDst->au32[6] = uSrc.au32[4 + ((bEvil >> 4) & 0x3)];
17090 puDst->au32[7] = uSrc.au32[4 + ((bEvil >> 6) & 0x3)];
17091}
17092
17093IEM_DECL_IMPL_DEF(void, iemAImpl_vpermilps_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
17094{
17095 RTUINT128U const uSrc1 = *puSrc1;
17096 RTUINT128U const uSrc2 = *puSrc2;
17097 ASMCompilerBarrier();
17098
17099 puDst->au32[0] = uSrc1.au32[uSrc2.au8[0] & 0x3];
17100 puDst->au32[1] = uSrc1.au32[uSrc2.au8[4] & 0x3];
17101 puDst->au32[2] = uSrc1.au32[uSrc2.au8[8] & 0x3];
17102 puDst->au32[3] = uSrc1.au32[uSrc2.au8[12] & 0x3];
17103}
17104
17105IEM_DECL_IMPL_DEF(void, iemAImpl_vpermilps_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
17106{
17107 RTUINT256U const uSrc1 = *puSrc1;
17108 RTUINT256U const uSrc2 = *puSrc2;
17109 ASMCompilerBarrier();
17110
17111 puDst->au32[0] = uSrc1.au32[uSrc2.au8[0] & 0x3];
17112 puDst->au32[1] = uSrc1.au32[uSrc2.au8[4] & 0x3];
17113 puDst->au32[2] = uSrc1.au32[uSrc2.au8[8] & 0x3];
17114 puDst->au32[3] = uSrc1.au32[uSrc2.au8[12] & 0x3];
17115
17116 puDst->au32[4] = uSrc1.au32[4 + (uSrc2.au8[16] & 0x3)];
17117 puDst->au32[5] = uSrc1.au32[4 + (uSrc2.au8[20] & 0x3)];
17118 puDst->au32[6] = uSrc1.au32[4 + (uSrc2.au8[24] & 0x3)];
17119 puDst->au32[7] = uSrc1.au32[4 + (uSrc2.au8[28] & 0x3)];
17120}
17121
17122
17123/**
17124 * VPERMILPD
17125 */
17126#ifdef IEM_WITHOUT_ASSEMBLY
17127IEM_DECL_IMPL_DEF(void, iemAImpl_vpermilpd_imm_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
17128{
17129 RTUINT128U const uSrc = *puSrc;
17130 ASMCompilerBarrier();
17131
17132 puDst->au64[0] = uSrc.au64[bEvil & 0x1];
17133 puDst->au64[1] = uSrc.au64[(bEvil >> 1) & 0x1];
17134}
17135
17136
17137IEM_DECL_IMPL_DEF(void, iemAImpl_vpermilpd_imm_u256,(PRTUINT256U puDst, PCRTUINT256U puSrc, uint8_t bEvil))
17138{
17139 RTUINT256U const uSrc = *puSrc;
17140 ASMCompilerBarrier();
17141
17142 puDst->au64[0] = uSrc.au64[bEvil & 0x1];
17143 puDst->au64[1] = uSrc.au64[(bEvil >> 1) & 0x1];
17144
17145 puDst->au64[2] = uSrc.au64[2 + ((bEvil >> 2) & 0x1)];
17146 puDst->au64[3] = uSrc.au64[2 + ((bEvil >> 3) & 0x1)];
17147}
17148
17149IEM_DECL_IMPL_DEF(void, iemAImpl_vpermilpd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
17150{
17151 RTUINT128U const uSrc1 = *puSrc1;
17152 RTUINT128U const uSrc2 = *puSrc2;
17153 ASMCompilerBarrier();
17154
17155 puDst->au64[0] = uSrc1.au64[(uSrc2.au8[0] & 0x2) >> 1];
17156 puDst->au64[1] = uSrc1.au64[(uSrc2.au8[8] & 0x2) >> 1];
17157}
17158
17159IEM_DECL_IMPL_DEF(void, iemAImpl_vpermilpd_u256,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
17160{
17161 RTUINT256U const uSrc1 = *puSrc1;
17162 RTUINT256U const uSrc2 = *puSrc2;
17163 ASMCompilerBarrier();
17164
17165 puDst->au64[0] = uSrc1.au64[(uSrc2.au8[0] & 0x2) >> 1];
17166 puDst->au64[1] = uSrc1.au64[(uSrc2.au8[8] & 0x2) >> 1];
17167
17168 puDst->au64[2] = uSrc1.au64[2 + ((uSrc2.au8[16] & 0x2) >> 1)];
17169 puDst->au64[3] = uSrc1.au64[2 + ((uSrc2.au8[24] & 0x2) >> 1)];
17170}
17171#endif
17172
17173
17174IEM_DECL_IMPL_DEF(void, iemAImpl_vpermilpd_imm_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
17175{
17176 RTUINT128U const uSrc = *puSrc;
17177 ASMCompilerBarrier();
17178
17179 puDst->au64[0] = uSrc.au64[bEvil & 0x1];
17180 puDst->au64[1] = uSrc.au64[(bEvil >> 1) & 0x1];
17181}
17182
17183
17184IEM_DECL_IMPL_DEF(void, iemAImpl_vpermilpd_imm_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc, uint8_t bEvil))
17185{
17186 RTUINT256U const uSrc = *puSrc;
17187 ASMCompilerBarrier();
17188
17189 puDst->au64[0] = uSrc.au64[bEvil & 0x1];
17190 puDst->au64[1] = uSrc.au64[(bEvil >> 1) & 0x1];
17191
17192 puDst->au64[2] = uSrc.au64[2 + ((bEvil >> 2) & 0x1)];
17193 puDst->au64[3] = uSrc.au64[2 + ((bEvil >> 3) & 0x1)];
17194}
17195
17196IEM_DECL_IMPL_DEF(void, iemAImpl_vpermilpd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
17197{
17198 RTUINT128U const uSrc1 = *puSrc1;
17199 RTUINT128U const uSrc2 = *puSrc2;
17200 ASMCompilerBarrier();
17201
17202 puDst->au64[0] = uSrc1.au64[(uSrc2.au8[0] & 0x2) >> 1];
17203 puDst->au64[1] = uSrc1.au64[(uSrc2.au8[8] & 0x2) >> 1];
17204}
17205
17206IEM_DECL_IMPL_DEF(void, iemAImpl_vpermilpd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
17207{
17208 RTUINT256U const uSrc1 = *puSrc1;
17209 RTUINT256U const uSrc2 = *puSrc2;
17210 ASMCompilerBarrier();
17211
17212 puDst->au64[0] = uSrc1.au64[(uSrc2.au8[0] & 0x2) >> 1];
17213 puDst->au64[1] = uSrc1.au64[(uSrc2.au8[8] & 0x2) >> 1];
17214
17215 puDst->au64[2] = uSrc1.au64[2 + ((uSrc2.au8[16] & 0x2) >> 1)];
17216 puDst->au64[3] = uSrc1.au64[2 + ((uSrc2.au8[24] & 0x2) >> 1)];
17217}
17218
17219
17220/*
17221 * [V]PBLENDVB
17222 */
17223IEM_DECL_IMPL_DEF(void, iemAImpl_pblendvb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, PCRTUINT128U puMask))
17224{
17225 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au8); i++)
17226 if (puMask->au8[i] & RT_BIT(7))
17227 puDst->au8[i] = puSrc->au8[i];
17228}
17229
17230
17231IEM_DECL_IMPL_DEF(void, iemAImpl_vpblendvb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, PCRTUINT128U puMask))
17232{
17233 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au8); i++)
17234 puDst->au8[i] = puMask->au8[i] & RT_BIT(7) ? puSrc2->au8[i] : puSrc1->au8[i];
17235}
17236
17237
17238IEM_DECL_IMPL_DEF(void, iemAImpl_vpblendvb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, PCRTUINT256U puMask))
17239{
17240 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au8); i++)
17241 puDst->au8[i] = puMask->au8[i] & RT_BIT(7) ? puSrc2->au8[i] : puSrc1->au8[i];
17242}
17243
17244
17245/*
17246 * [V]BLENDVPS
17247 */
17248IEM_DECL_IMPL_DEF(void, iemAImpl_blendvps_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, PCRTUINT128U puMask))
17249{
17250 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au32); i++)
17251 if (puMask->au32[i] & RT_BIT_32(31))
17252 puDst->au32[i] = puSrc->au32[i];
17253}
17254
17255
17256IEM_DECL_IMPL_DEF(void, iemAImpl_vblendvps_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, PCRTUINT128U puMask))
17257{
17258 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au32); i++)
17259 puDst->au32[i] = (puMask->au32[i] & RT_BIT_32(31)) ? puSrc2->au32[i] : puSrc1->au32[i];
17260}
17261
17262
17263IEM_DECL_IMPL_DEF(void, iemAImpl_vblendvps_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, PCRTUINT256U puMask))
17264{
17265 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au32); i++)
17266 puDst->au32[i] = (puMask->au32[i] & RT_BIT_32(31)) ? puSrc2->au32[i] : puSrc1->au32[i];
17267}
17268
17269
17270/*
17271 * [V]BLENDVPD
17272 */
17273IEM_DECL_IMPL_DEF(void, iemAImpl_blendvpd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, PCRTUINT128U puMask))
17274{
17275 if (puMask->au64[0] & RT_BIT_64(63)) puDst->au64[0] = puSrc->au64[0];
17276 if (puMask->au64[1] & RT_BIT_64(63)) puDst->au64[1] = puSrc->au64[1];
17277}
17278
17279
17280IEM_DECL_IMPL_DEF(void, iemAImpl_vblendvpd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, PCRTUINT128U puMask))
17281{
17282 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au64); i++)
17283 puDst->au64[i] = (puMask->au64[i] & RT_BIT_64(63)) ? puSrc2->au64[i] : puSrc1->au64[i];
17284}
17285
17286
17287IEM_DECL_IMPL_DEF(void, iemAImpl_vblendvpd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, PCRTUINT256U puMask))
17288{
17289 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au64); i++)
17290 puDst->au64[i] = (puMask->au64[i] & RT_BIT_64(63)) ? puSrc2->au64[i] : puSrc1->au64[i];
17291}
17292
17293
17294/**
17295 * [V]PALIGNR
17296 */
17297IEM_DECL_IMPL_DEF(void, iemAImpl_palignr_u64_fallback,(uint64_t *pu64Dst, uint64_t u64Src2, uint8_t bEvil))
17298{
17299 uint64_t const u64Src1 = *pu64Dst;
17300 ASMCompilerBarrier();
17301
17302 if (bEvil >= 16)
17303 *pu64Dst = 0;
17304 else if (bEvil >= 8)
17305 *pu64Dst = u64Src1 >> ((bEvil - 8) * 8);
17306 else
17307 {
17308 uint8_t cShift = bEvil * 8;
17309 *pu64Dst = ((u64Src1 & (RT_BIT_64(cShift) - 1)) << ((8 - bEvil) * 8))
17310 | (u64Src2 >> cShift);
17311 }
17312}
17313
17314
17315IEM_DECL_IMPL_DEF(void, iemAImpl_palignr_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
17316{
17317 RTUINT128U const uSrc1 = *puDst;
17318 RTUINT128U const uSrc2 = *puSrc;
17319 ASMCompilerBarrier();
17320
17321 puDst->au64[0] = 0;
17322 puDst->au64[1] = 0;
17323 if (bEvil >= 32)
17324 { /* Everything stays 0. */ }
17325 else if (bEvil >= 16)
17326 {
17327 bEvil -= 16;
17328 for (uint8_t i = bEvil; i < RT_ELEMENTS(puDst->au8); i++)
17329 puDst->au8[i - bEvil] = uSrc1.au8[i];
17330 }
17331 else
17332 {
17333 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au8) - bEvil; i++)
17334 puDst->au8[i] = uSrc2.au8[i + bEvil];
17335 for (uint8_t i = 0; i < bEvil; i++)
17336 puDst->au8[i + RT_ELEMENTS(puDst->au8) - bEvil] = uSrc1.au8[i];
17337 }
17338}
17339
17340
17341IEM_DECL_IMPL_DEF(void, iemAImpl_vpalignr_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
17342{
17343 RTUINT128U const uSrc1 = *puSrc1; /* Might overlap with destination. */
17344 RTUINT128U const uSrc2 = *puSrc2;
17345 ASMCompilerBarrier();
17346
17347 puDst->au64[0] = 0;
17348 puDst->au64[1] = 0;
17349 if (bEvil >= 32)
17350 { /* Everything stays 0. */ }
17351 else if (bEvil >= 16)
17352 {
17353 bEvil -= 16;
17354 for (uint8_t i = bEvil; i < RT_ELEMENTS(puDst->au8); i++)
17355 puDst->au8[i - bEvil] = uSrc1.au8[i];
17356 }
17357 else
17358 {
17359 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au8) - bEvil; i++)
17360 puDst->au8[i] = uSrc2.au8[i + bEvil];
17361 for (uint8_t i = 0; i < bEvil; i++)
17362 puDst->au8[i + RT_ELEMENTS(puDst->au8) - bEvil] = uSrc1.au8[i];
17363 }
17364}
17365
17366
17367IEM_DECL_IMPL_DEF(void, iemAImpl_vpalignr_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bEvil))
17368{
17369 RTUINT256U const uSrc1 = *puSrc1; /* Might overlap with destination. */
17370 RTUINT256U const uSrc2 = *puSrc2;
17371 ASMCompilerBarrier();
17372
17373 iemAImpl_vpalignr_u128_fallback(&puDst->au128[0], &uSrc1.au128[0], &uSrc2.au128[0], bEvil);
17374 iemAImpl_vpalignr_u128_fallback(&puDst->au128[1], &uSrc1.au128[1], &uSrc2.au128[1], bEvil);
17375}
17376
17377
17378/**
17379 * [V]PBLENDW
17380 */
17381IEM_DECL_IMPL_DEF(void, iemAImpl_pblendw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
17382{
17383 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au16); i++)
17384 if (bEvil & RT_BIT(i))
17385 puDst->au16[i] = puSrc->au16[i];
17386}
17387
17388
17389IEM_DECL_IMPL_DEF(void, iemAImpl_vpblendw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
17390{
17391 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au16); i++)
17392 if (bEvil & RT_BIT(i))
17393 puDst->au16[i] = puSrc2->au16[i];
17394 else
17395 puDst->au16[i] = puSrc1->au16[i];
17396}
17397
17398
17399IEM_DECL_IMPL_DEF(void, iemAImpl_vpblendw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bEvil))
17400{
17401 for (uint8_t i = 0; i < 8; i++)
17402 if (bEvil & RT_BIT(i))
17403 {
17404 puDst->au16[ i] = puSrc2->au16[ i];
17405 puDst->au16[8 + i] = puSrc2->au16[8 + i];
17406 }
17407 else
17408 {
17409 puDst->au16[ i] = puSrc1->au16[ i];
17410 puDst->au16[8 + i] = puSrc1->au16[8 + i];
17411 }
17412}
17413
17414
17415/**
17416 * [V]PBLENDD
17417 */
17418IEM_DECL_IMPL_DEF(void, iemAImpl_vpblendd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
17419{
17420 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au32); i++)
17421 if (bEvil & RT_BIT(i))
17422 puDst->au32[i] = puSrc2->au32[i];
17423 else
17424 puDst->au32[i] = puSrc1->au32[i];
17425}
17426
17427
17428IEM_DECL_IMPL_DEF(void, iemAImpl_vpblendd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bEvil))
17429{
17430 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au32); i++)
17431 if (bEvil & RT_BIT(i))
17432 puDst->au32[i] = puSrc2->au32[i];
17433 else
17434 puDst->au32[i] = puSrc1->au32[i];
17435}
17436
17437
17438/**
17439 * [V]BLENDPS
17440 */
17441IEM_DECL_IMPL_DEF(void, iemAImpl_blendps_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
17442{
17443 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au32); i++)
17444 if (bEvil & RT_BIT(i))
17445 puDst->au32[i] = puSrc->au32[i];
17446}
17447
17448
17449IEM_DECL_IMPL_DEF(void, iemAImpl_vblendps_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
17450{
17451 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au32); i++)
17452 if (bEvil & RT_BIT(i))
17453 puDst->au32[i] = puSrc2->au32[i];
17454 else
17455 puDst->au32[i] = puSrc1->au32[i];
17456}
17457
17458
17459IEM_DECL_IMPL_DEF(void, iemAImpl_vblendps_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bEvil))
17460{
17461 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au32); i++)
17462 if (bEvil & RT_BIT(i))
17463 puDst->au32[i] = puSrc2->au32[i];
17464 else
17465 puDst->au32[i] = puSrc1->au32[i];
17466}
17467
17468
17469/**
17470 * [V]BLENDPD
17471 */
17472IEM_DECL_IMPL_DEF(void, iemAImpl_blendpd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
17473{
17474 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au64); i++)
17475 if (bEvil & RT_BIT(i))
17476 puDst->au64[i] = puSrc->au64[i];
17477}
17478
17479
17480IEM_DECL_IMPL_DEF(void, iemAImpl_vblendpd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
17481{
17482 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au64); i++)
17483 if (bEvil & RT_BIT(i))
17484 puDst->au64[i] = puSrc2->au64[i];
17485 else
17486 puDst->au64[i] = puSrc1->au64[i];
17487}
17488
17489
17490IEM_DECL_IMPL_DEF(void, iemAImpl_vblendpd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bEvil))
17491{
17492 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au64); i++)
17493 if (bEvil & RT_BIT(i))
17494 puDst->au64[i] = puSrc2->au64[i];
17495 else
17496 puDst->au64[i] = puSrc1->au64[i];
17497}
17498
17499
17500/**
17501 * AES tables and helper routines. Tables from Intel AES-NI whitepaper.
17502 */
17503
/**
 * The AES S-Box lookup table: 256 entries indexed by the input byte,
 * yielding the substituted byte.
 *
 * Read by iemAImpl_aes_sub_word() and handed to iemAImpl_aes_sub_bytes() by
 * the encryption helpers in this file.
 *
 * NOTE(review): nothing visible here writes to the table; it could be made
 * const if the helper parameters it is passed to were const-qualified too.
 */
static uint8_t iemAImpl_aes_sbox[] = {
    0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76,
    0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0,
    0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15,
    0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75,
    0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84,
    0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf,
    0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8,
    0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5, 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2,
    0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17, 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73,
    0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb,
    0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c, 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79,
    0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9, 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08,
    0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a,
    0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e,
    0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf,
    0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
};
17522
/**
 * The InvS-Box lookup table: 256 entries indexed by the input byte,
 * yielding the inversely substituted byte.
 *
 * Handed to iemAImpl_aes_sub_bytes() by the decryption helpers in this file.
 */
static uint8_t iemAImpl_aes_inv_sbox[] = {
    0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38, 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb,
    0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87, 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb,
    0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d, 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e,
    0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2, 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25,
    0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16, 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92,
    0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda, 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84,
    0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a, 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06,
    0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02, 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b,
    0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea, 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73,
    0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85, 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e,
    0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89, 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b,
    0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20, 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4,
    0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31, 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f,
    0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d, 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef,
    0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0, 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61,
    0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26, 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
};
17542
/**
 * The ShiftRows lookup table.
 *
 * Entry i is the index of the input byte that ends up as output byte i, see
 * iemAImpl_aes_shift_rows() which consumes it.
 */
static uint8_t iemAImpl_aes_shift_rows_tbl[] = {
    0, 5, 10, 15, 4, 9, 14, 3, 8, 13, 2, 7, 12, 1, 6, 11
};
17547
/**
 * The InvShiftRows lookup table.
 *
 * Entry i is the index of the input byte that ends up as output byte i, see
 * iemAImpl_aes_shift_rows() which consumes it.
 */
static uint8_t iemAImpl_aes_inv_shift_rows_tbl[] = {
    0, 13, 10, 7, 4, 1, 14, 11, 8, 5, 2, 15, 12, 9, 6, 3
};
17552
17553static inline RTUINT128U iemAImpl_aes_sub_bytes(PCRTUINT128U puSrc, uint8_t abSubst[256])
17554{
17555 RTUINT128U uVal;
17556 int i;
17557
17558 for (i = 0; i < 16; ++i)
17559 uVal.au8[i] = abSubst[puSrc->au8[i]];
17560
17561 return uVal;
17562}
17563
/**
 * Doubles a byte: shifts it left by one bit and, when the bit shifted out
 * was set, XORs the result with 0x1b (27).
 *
 * @returns The doubled value, truncated to 8 bits.
 * @param   u   The byte to double.
 */
static inline uint8_t iemAImpl_aes_xtime(uint8_t u)
{
    uint8_t const bDoubled = (uint8_t)(u << 1);
    if (u & 0x80)
        return (uint8_t)(bDoubled ^ 0x1b);
    return bDoubled;
}
17568
17569static RTUINT128U iemAImpl_aes_mix_col(PCRTUINT128U puSrc)
17570{
17571 RTUINT128U uVal;
17572 int i;
17573 uint8_t tmp;
17574
17575 for (i = 0; i < 16; i += 4) {
17576 tmp = puSrc->au8[i+0] ^ puSrc->au8[i+1] ^ puSrc->au8[i+2] ^ puSrc->au8[i+3];
17577 uVal.au8[i+0] = puSrc->au8[i+0] ^ tmp ^ iemAImpl_aes_xtime(puSrc->au8[i+0] ^ puSrc->au8[i+1]);
17578 uVal.au8[i+1] = puSrc->au8[i+1] ^ tmp ^ iemAImpl_aes_xtime(puSrc->au8[i+1] ^ puSrc->au8[i+2]);
17579 uVal.au8[i+2] = puSrc->au8[i+2] ^ tmp ^ iemAImpl_aes_xtime(puSrc->au8[i+2] ^ puSrc->au8[i+3]);
17580 uVal.au8[i+3] = puSrc->au8[i+3] ^ tmp ^ iemAImpl_aes_xtime(puSrc->au8[i+3] ^ puSrc->au8[i+0]);
17581 }
17582
17583 return uVal;
17584}
17585
17586static inline RTUINT128U iemAImpl_aes_shift_rows(PCRTUINT128U puSrc, uint8_t abShift[16])
17587{
17588 RTUINT128U uVal;
17589 int i;
17590
17591 for (i = 0; i < 16; ++i)
17592 uVal.au8[i] = puSrc->au8[abShift[i]];
17593
17594 return uVal;
17595}
17596
17597static uint8_t iemAImpl_aes_clmul(uint8_t a, uint8_t b)
17598{
17599 uint8_t val;
17600
17601 val = ((b >> 0) & 1) * a;
17602 val ^= ((b >> 1) & 1) * iemAImpl_aes_xtime(a);
17603 val ^= ((b >> 2) & 1) * iemAImpl_aes_xtime(iemAImpl_aes_xtime(a));
17604 val ^= ((b >> 3) & 1) * iemAImpl_aes_xtime(iemAImpl_aes_xtime(iemAImpl_aes_xtime(a)));
17605 val ^= ((b >> 4) & 1) * iemAImpl_aes_xtime(iemAImpl_aes_xtime(iemAImpl_aes_xtime(iemAImpl_aes_xtime(a))));
17606
17607 return val;
17608}
17609
17610static RTUINT128U iemAImpl_aes_inv_mix_col(PCRTUINT128U puSrc)
17611{
17612 RTUINT128U uVal;
17613 int i;
17614
17615 for (i = 0; i < 16; i += 4) {
17616 uVal.au8[i+0] = iemAImpl_aes_clmul(puSrc->au8[i+0], 0x0e) ^ iemAImpl_aes_clmul(puSrc->au8[i+1], 0x0b)^ iemAImpl_aes_clmul(puSrc->au8[i+2], 0x0d) ^ iemAImpl_aes_clmul(puSrc->au8[i+3], 0x09);
17617 uVal.au8[i+1] = iemAImpl_aes_clmul(puSrc->au8[i+0], 0x09) ^ iemAImpl_aes_clmul(puSrc->au8[i+1], 0x0e)^ iemAImpl_aes_clmul(puSrc->au8[i+2], 0x0b) ^ iemAImpl_aes_clmul(puSrc->au8[i+3], 0x0d);
17618 uVal.au8[i+2] = iemAImpl_aes_clmul(puSrc->au8[i+0], 0x0d) ^ iemAImpl_aes_clmul(puSrc->au8[i+1], 0x09)^ iemAImpl_aes_clmul(puSrc->au8[i+2], 0x0e) ^ iemAImpl_aes_clmul(puSrc->au8[i+3], 0x0b);
17619 uVal.au8[i+3] = iemAImpl_aes_clmul(puSrc->au8[i+0], 0x0b) ^ iemAImpl_aes_clmul(puSrc->au8[i+1], 0x0d)^ iemAImpl_aes_clmul(puSrc->au8[i+2], 0x09) ^ iemAImpl_aes_clmul(puSrc->au8[i+3], 0x0e);
17620 }
17621
17622 return uVal;
17623}
17624
17625static inline uint32_t iemAImpl_aes_sub_word(uint32_t w)
17626{
17627 RTUINT32U uTmp;
17628
17629 uTmp.au32[0] = w;
17630 uTmp.au8[0] = iemAImpl_aes_sbox[uTmp.au8[0]];
17631 uTmp.au8[1] = iemAImpl_aes_sbox[uTmp.au8[1]];
17632 uTmp.au8[2] = iemAImpl_aes_sbox[uTmp.au8[2]];
17633 uTmp.au8[3] = iemAImpl_aes_sbox[uTmp.au8[3]];
17634
17635 return uTmp.au32[0];
17636}
17637
/**
 * Rotates a 32-bit word right by 8 bits.
 *
 * @returns The rotated word.
 * @param   w   The input word.
 */
static inline uint32_t iemAImpl_aes_rot_word(uint32_t w)
{
    uint32_t const uUpper = w >> 8;   /* bits 31:8 move down to 23:0 */
    uint32_t const uLower = w << 24;  /* bits 7:0 wrap around to 31:24 */
    return uLower | uUpper;
}
17642
17643/**
17644 * [V]AESKEYGENASSIST
17645 */
17646IEM_DECL_IMPL_DEF(void, iemAImpl_aeskeygenassist_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bImm))
17647{
17648 RTUINT128U uTmp;
17649 uint32_t uRCon = bImm; /* Round constant. */
17650
17651 uTmp.au32[0] = iemAImpl_aes_sub_word(puSrc->au32[1]); /* puSrc = KeyGen. */
17652 uTmp.au32[1] = iemAImpl_aes_rot_word(iemAImpl_aes_sub_word(puSrc->au32[1])) ^ uRCon;
17653 uTmp.au32[2] = iemAImpl_aes_sub_word(puSrc->au32[3]);
17654 uTmp.au32[3] = iemAImpl_aes_rot_word(iemAImpl_aes_sub_word(puSrc->au32[3])) ^ uRCon;
17655
17656 *puDst = uTmp;
17657}
17658
17659
17660/**
17661 * [V]AESIMC
17662 */
17663IEM_DECL_IMPL_DEF(void, iemAImpl_aesimc_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
17664{
17665 *puDst = iemAImpl_aes_inv_mix_col(puSrc); /* Src = Key. */
17666}
17667
17668
17669/**
17670 * [V]AESENC
17671 */
17672IEM_DECL_IMPL_DEF(void, iemAImpl_aesenc_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
17673{
17674 RTUINT128U uTmp;
17675
17676 uTmp = iemAImpl_aes_shift_rows(puDst, iemAImpl_aes_shift_rows_tbl); /* Dst = state. */
17677 uTmp = iemAImpl_aes_sub_bytes(&uTmp, iemAImpl_aes_sbox);
17678 uTmp = iemAImpl_aes_mix_col(&uTmp);
17679 uTmp.au64[0] ^= puSrc->au64[0]; /* Src = Round Key. */
17680 uTmp.au64[1] ^= puSrc->au64[1];
17681
17682 *puDst = uTmp;
17683}
17684
17685
17686/**
17687 * [V]AESENCLAST
17688 */
17689IEM_DECL_IMPL_DEF(void, iemAImpl_aesenclast_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
17690{
17691 RTUINT128U uTmp;
17692
17693 uTmp = iemAImpl_aes_shift_rows(puDst, iemAImpl_aes_shift_rows_tbl); /* Dst = state. */
17694 uTmp = iemAImpl_aes_sub_bytes(&uTmp, iemAImpl_aes_sbox);
17695 uTmp.au64[0] ^= puSrc->au64[0]; /* Src = Round Key. */
17696 uTmp.au64[1] ^= puSrc->au64[1];
17697
17698 *puDst = uTmp;
17699}
17700
17701
17702/**
17703 * [V]AESDEC
17704 */
17705IEM_DECL_IMPL_DEF(void, iemAImpl_aesdec_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
17706{
17707 RTUINT128U uTmp;
17708
17709 uTmp = iemAImpl_aes_shift_rows(puDst, iemAImpl_aes_inv_shift_rows_tbl); /* Dst = state. */
17710 uTmp = iemAImpl_aes_sub_bytes(&uTmp, iemAImpl_aes_inv_sbox);
17711 uTmp = iemAImpl_aes_inv_mix_col(&uTmp);
17712 uTmp.au64[0] ^= puSrc->au64[0]; /* Src = Round Key. */
17713 uTmp.au64[1] ^= puSrc->au64[1];
17714
17715 *puDst = uTmp;
17716}
17717
17718
17719/**
17720 * [V]AESDECLAST
17721 */
17722IEM_DECL_IMPL_DEF(void, iemAImpl_aesdeclast_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
17723{
17724 RTUINT128U uTmp;
17725
17726 uTmp = iemAImpl_aes_shift_rows(puDst, iemAImpl_aes_inv_shift_rows_tbl); /* Dst = state. */
17727 uTmp = iemAImpl_aes_sub_bytes(&uTmp, iemAImpl_aes_inv_sbox);
17728 uTmp.au64[0] ^= puSrc->au64[0]; /* Src = Round Key. */
17729 uTmp.au64[1] ^= puSrc->au64[1];
17730
17731 *puDst = uTmp;
17732}
17733
17734
17735/**
17736 * [V]PCMPISTRI
17737 */
17738
/**
 * Does the comparisons based on the mode and source input format.
 *
 * Fills afCmpRes[idxSrc2][idxSrc1] with the outcome of comparing element
 * idxSrc1 of the first source against element idxSrc2 of the second source,
 * for all element pairs.
 *
 * Bits 1:0 of bImm select the element format: 0 = unsigned bytes,
 * 1 = unsigned words, 2 = signed bytes, 3 = signed words.  Bits 3:2 select
 * the comparison: for the values 0, 2 and 3 the elements are tested for
 * equality; for the value 1 even numbered first source elements are tested
 * with '<=' and odd numbered ones with '>=' against the second source
 * element.
 *
 * @param   afCmpRes    Where to store the comparison results.  With word
 *                      sized elements only the [0..7][0..7] part is written.
 * @param   puSrc1      The first source operand.
 * @param   puSrc2      The second source operand.
 * @param   bImm        The immediate byte, see above.
 *
 * @note    The PCMPXSTRX_CMP_CASE macro does not use its a_fCmpRes argument,
 *          it stores directly into the afCmpRes parameter of this function.
 */
static void iemAImpl_pcmpxstrx_cmp(bool afCmpRes[16][16], PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bImm)
{
#define PCMPXSTRX_CMP_CASE(a_fCmpRes, a_puSrc1, a_puSrc2, a_SrcMember, a_bAggOp) \
    do \
    { \
        for (uint8_t idxSrc2 = 0; idxSrc2 < RT_ELEMENTS((a_puSrc2)->a_SrcMember); idxSrc2++) \
            for (uint8_t idxSrc1 = 0; idxSrc1 < RT_ELEMENTS((a_puSrc1)->a_SrcMember); idxSrc1 += 2) \
            { \
                switch (a_bAggOp) \
                { \
                    case 0: \
                    case 2: \
                    case 3: \
                        afCmpRes[idxSrc2][idxSrc1]     = (a_puSrc1)->a_SrcMember[idxSrc1]     == (a_puSrc2)->a_SrcMember[idxSrc2]; \
                        afCmpRes[idxSrc2][idxSrc1 + 1] = (a_puSrc1)->a_SrcMember[idxSrc1 + 1] == (a_puSrc2)->a_SrcMember[idxSrc2]; \
                        break; \
                    case 1: \
                        afCmpRes[idxSrc2][idxSrc1]     = (a_puSrc1)->a_SrcMember[idxSrc1]     <= (a_puSrc2)->a_SrcMember[idxSrc2]; \
                        afCmpRes[idxSrc2][idxSrc1 + 1] = (a_puSrc1)->a_SrcMember[idxSrc1 + 1] >= (a_puSrc2)->a_SrcMember[idxSrc2]; \
                        break; \
                    default: \
                        AssertReleaseFailed(); \
                } \
            } \
    } while(0)

    /* The aggregation operation decides which comparison operator the macro uses. */
    uint8_t bAggOp = (bImm >> 2) & 0x3;
    /* The element format decides which union member the elements are read through. */
    switch (bImm & 0x3)
    {
        case 0:
            PCMPXSTRX_CMP_CASE(afCmpRes, puSrc1, puSrc2, au8, bAggOp);
            break;
        case 1:
            PCMPXSTRX_CMP_CASE(afCmpRes, puSrc1, puSrc2, au16, bAggOp);
            break;
        case 2:
            PCMPXSTRX_CMP_CASE(afCmpRes, puSrc1, puSrc2, ai8, bAggOp);
            break;
        case 3:
            PCMPXSTRX_CMP_CASE(afCmpRes, puSrc1, puSrc2, ai16, bAggOp);
            break;
        default:
            AssertReleaseFailed();
    }
#undef PCMPXSTRX_CMP_CASE
}
17788
17789static uint8_t iemAImpl_pcmpistrx_get_str_len_implicit(PCRTUINT128U puSrc, uint8_t bImm)
17790{
17791 if (bImm & 0x1)
17792 {
17793 /* Words -> 8 elements. */
17794 for (uint8_t i = 0; i < RT_ELEMENTS(puSrc->au16); i++)
17795 if (puSrc->au16[i] == 0)
17796 return i;
17797
17798 return 8;
17799 }
17800 else
17801 {
17802 /* Bytes -> 16 elements. */
17803 for (uint8_t i = 0; i < RT_ELEMENTS(puSrc->au8); i++)
17804 if (puSrc->au8[i] == 0)
17805 return i;
17806
17807 return 16;
17808 }
17809}
17810
17811static uint8_t iemAImpl_pcmpistrx_get_str_len_explicit(int64_t i64Len, uint8_t bImm)
17812{
17813 if (bImm & 0x1)
17814 {
17815 if (i64Len > -8 && i64Len < 8)
17816 return RT_ABS(i64Len);
17817
17818 return 8;
17819 }
17820 else
17821 {
17822 if (i64Len > -16 && i64Len < 16)
17823 return RT_ABS(i64Len);
17824
17825 return 16;
17826 }
17827}
17828
/**
 * Valid/Invalid override of comparisons (Table 4-7 from 4.1.6 of SDM).
 *
 * The first index is the aggregation operation (Imm8[3:2]).  The second
 * index is formed by iemAImpl_pcmpxstrx_cmp_override_if_invalid() as
 * 2 if the first source element is valid plus 1 if the second one is; the
 * value 3 (both valid) is never looked up, so the last column is padding.
 */
static const bool g_afCmpOverride[4][4] =
{
    /* xmm1 AND xmm2/m128 invalid, xmm1 invalid BUT xmm2/m128 valid, xmm1 valid BUT xmm2/m128 invalid, unused dummy/padding for parfait */
    { false, false, false, false }, /* Imm8[3:2] = 00b (equal any) */
    { false, false, false, false }, /* Imm8[3:2] = 01b (ranges) */
    { true, false, false, false }, /* Imm8[3:2] = 10b (equal each) */
    { true, true, false, false }, /* Imm8[3:2] = 11b (equal ordered) */
};
17840
17841DECL_FORCE_INLINE(bool) iemAImpl_pcmpxstrx_cmp_override_if_invalid(bool fCmpRes, bool fSrc1Valid, bool fSrc2Valid, uint8_t bAggOp)
17842{
17843 if (fSrc1Valid && fSrc2Valid)
17844 return fCmpRes;
17845
17846 uint8_t const bSrc1Valid = fSrc1Valid ? 2 : 0;
17847 uint8_t const bSrc2Valid = fSrc2Valid ? 1 : 0;
17848 return g_afCmpOverride[bAggOp][bSrc1Valid + bSrc2Valid];
17849}
17850
/**
 * Aggregates the element comparison results into a bit mask and applies the
 * polarity selected by the immediate.
 *
 * An element of the first/second source counts as valid when its index is
 * below @a idxLen1 / @a idxLen2; results involving invalid elements are
 * replaced via iemAImpl_pcmpxstrx_cmp_override_if_invalid().
 *
 * @returns The result mask, one bit per element of the second source.
 * @param   afCmpRes    The comparison results, indexed [idxSrc2][idxSrc1].
 * @param   idxLen1     Number of valid elements in the first source.
 * @param   idxLen2     Number of valid elements in the second source.
 * @param   cElems      Number of elements per operand (loop bound).
 * @param   bImm        The immediate byte: bits 3:2 select the aggregation
 *                      operation, bits 5:4 the polarity.
 */
static uint16_t iemAImpl_pcmpxstrx_cmp_aggregate(bool afCmpRes[16][16], uint8_t idxLen1, uint8_t idxLen2, uint8_t cElems, uint8_t bImm)
{
    uint8_t bAggOp = (bImm >> 2) & 0x3;
    uint16_t u16Result = 0;

    switch (bAggOp)
    {
        case 0: /* Equal any */
            /* Bit idxSrc2 is set if any first source element compared true against it. */
            for (uint8_t idxSrc2 = 0; idxSrc2 < cElems; idxSrc2++)
            {
                uint16_t u16Res = 0;
                for (uint8_t idxSrc1 = 0; idxSrc1 < cElems; idxSrc1++)
                {
                    if (iemAImpl_pcmpxstrx_cmp_override_if_invalid(afCmpRes[idxSrc2][idxSrc1],
                                                                   idxSrc1 < idxLen1,
                                                                   idxSrc2 < idxLen2,
                                                                   bAggOp))
                    {
                        u16Res = RT_BIT(idxSrc2);
                        break;
                    }
                }

                u16Result |= u16Res;
            }
            break;

        case 1: /* Ranges */
            /* Bit idxSrc2 is set if both results of any even/odd element pair of the first source are true. */
            for (uint8_t idxSrc2 = 0; idxSrc2 < cElems; idxSrc2++)
            {
                uint16_t u16Res = 0;
                for (uint8_t idxSrc1 = 0; idxSrc1 < cElems; idxSrc1 += 2)
                {
                    if (   iemAImpl_pcmpxstrx_cmp_override_if_invalid(afCmpRes[idxSrc2][idxSrc1],
                                                                      idxSrc1 < idxLen1,
                                                                      idxSrc2 < idxLen2,
                                                                      bAggOp)
                        && iemAImpl_pcmpxstrx_cmp_override_if_invalid(afCmpRes[idxSrc2][idxSrc1 + 1],
                                                                      (idxSrc1 + 1) < idxLen1,
                                                                      idxSrc2 < idxLen2,
                                                                      bAggOp))
                    {
                        u16Res = RT_BIT(idxSrc2);
                        break;
                    }
                }

                u16Result |= u16Res;
            }
            break;

        case 2: /* Equal each */
            /* Bit i is set if the elements at the same index i compared true. */
            for (uint8_t i = 0; i < cElems; i++)
            {
                if (iemAImpl_pcmpxstrx_cmp_override_if_invalid(afCmpRes[i][i],
                                                               i < idxLen1,
                                                               i < idxLen2,
                                                               bAggOp))
                    u16Result |= RT_BIT(i);
            }
            break;

        case 3: /* Equal ordered */
            u16Result = 0;
            /* Bit idxSrc2 stays set only if every first source element idxSrc1 compared
               true against second source element idxSrc2 + idxSrc1 (k), as far as k stays
               within the operand. */
            for (uint8_t idxSrc2 = 0; idxSrc2 < cElems; idxSrc2++)
            {
                uint16_t u16Res = RT_BIT(idxSrc2);
                for (uint8_t idxSrc1 = 0, k = idxSrc2; (idxSrc1 < (cElems - idxSrc2)) && (k < cElems); idxSrc1++, k++)
                {
                    if (!iemAImpl_pcmpxstrx_cmp_override_if_invalid(afCmpRes[k][idxSrc1],
                                                                    idxSrc1 < idxLen1,
                                                                    k < idxLen2,
                                                                    bAggOp))
                    {
                        u16Res = 0;
                        break;
                    }
                }

                u16Result |= u16Res;
            }
            break;
    }

    /* Polarity selection. */
    switch ((bImm >> 4) & 0x3)
    {
        case 0:
        case 2:
            /* Nothing to do. */
            break;
        case 1:
            /* Invert all cElems result bits. */
            u16Result = (cElems == 8 ? 0xff : 0xffff) ^ u16Result;
            break;
        case 3:
            /* Invert only the bits below idxLen2, i.e. those of valid second source elements. */
            u16Result ^= RT_BIT(idxLen2) - 1;
            break;
        default:
            AssertReleaseFailed();
    }

    return u16Result;
}
17954
17955DECL_FORCE_INLINE(void) iemAImpl_pcmpxstrx_set_eflags(uint32_t *pfEFlags, uint16_t u16Result, uint8_t cLen1, uint8_t cLen2, uint8_t cElems)
17956{
17957 uint32_t fEFlags = 0;
17958
17959 if (u16Result)
17960 fEFlags |= X86_EFL_CF;
17961 if (cLen2 < cElems)
17962 fEFlags |= X86_EFL_ZF;
17963 if (cLen1 < cElems)
17964 fEFlags |= X86_EFL_SF;
17965 if (u16Result & 0x1)
17966 fEFlags |= X86_EFL_OF;
17967 *pfEFlags = (*pfEFlags & ~X86_EFL_STATUS_BITS) | fEFlags;
17968}
17969
17970DECL_FORCE_INLINE(uint16_t) iemAImpl_pcmpxstrx_worker(uint32_t *pEFlags, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2,
17971 uint8_t cLen1, uint8_t cLen2, uint8_t bEvil)
17972{
17973 bool afCmpRes[16][16];
17974 uint8_t cElems = (bEvil & RT_BIT(0)) ? 8 : 16;
17975
17976 iemAImpl_pcmpxstrx_cmp(afCmpRes, puSrc1, puSrc2, bEvil);
17977 uint16_t u16Result = iemAImpl_pcmpxstrx_cmp_aggregate(afCmpRes, cLen1, cLen2, cElems, bEvil);
17978 iemAImpl_pcmpxstrx_set_eflags(pEFlags, u16Result, cLen1, cLen2, cElems);
17979
17980 return u16Result;
17981}
17982
17983DECL_FORCE_INLINE(void) iemAImpl_pcmpxstri_set_result_index(uint32_t *pu32Ecx, uint16_t u16Result, uint8_t cElems, uint8_t bImm)
17984{
17985 if (bImm & RT_BIT(6))
17986 {
17987 /* Index for MSB set. */
17988 uint32_t idxMsb = ASMBitLastSetU16(u16Result);
17989 if (idxMsb)
17990 *pu32Ecx = idxMsb - 1;
17991 else
17992 *pu32Ecx = cElems;
17993 }
17994 else
17995 {
17996 /* Index for LSB set. */
17997 uint32_t idxLsb = ASMBitFirstSetU16(u16Result);
17998 if (idxLsb)
17999 *pu32Ecx = idxLsb - 1;
18000 else
18001 *pu32Ecx = cElems;
18002 }
18003}
18004
18005IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpistri_u128_fallback,(uint32_t *pu32Ecx, uint32_t *pEFlags, PCIEMPCMPISTRXSRC pSrc, uint8_t bEvil))
18006{
18007 uint8_t cElems = (bEvil & RT_BIT(0)) ? 8 : 16;
18008 uint8_t cLen1 = iemAImpl_pcmpistrx_get_str_len_implicit(&pSrc->uSrc1, bEvil);
18009 uint8_t cLen2 = iemAImpl_pcmpistrx_get_str_len_implicit(&pSrc->uSrc2, bEvil);
18010
18011 uint16_t u16Result = iemAImpl_pcmpxstrx_worker(pEFlags, &pSrc->uSrc1, &pSrc->uSrc2, cLen1, cLen2, bEvil);
18012 iemAImpl_pcmpxstri_set_result_index(pu32Ecx, u16Result, cElems, bEvil);
18013}
18014
18015
18016/**
18017 * [V]PCMPESTRI
18018 */
18019IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpestri_u128_fallback,(uint32_t *pu32Ecx, uint32_t *pEFlags, PCIEMPCMPESTRXSRC pSrc, uint8_t bEvil))
18020{
18021 uint8_t cElems = (bEvil & RT_BIT(0)) ? 8 : 16;
18022 uint8_t cLen1 = iemAImpl_pcmpistrx_get_str_len_explicit((int64_t)pSrc->u64Rax, bEvil);
18023 uint8_t cLen2 = iemAImpl_pcmpistrx_get_str_len_explicit((int64_t)pSrc->u64Rdx, bEvil);
18024
18025 uint16_t u16Result = iemAImpl_pcmpxstrx_worker(pEFlags, &pSrc->uSrc1, &pSrc->uSrc2, cLen1, cLen2, bEvil);
18026 iemAImpl_pcmpxstri_set_result_index(pu32Ecx, u16Result, cElems, bEvil);
18027}
18028
18029
18030/**
18031 * [V]PCMPISTRM
18032 */
18033DECL_FORCE_INLINE(void) iemAImpl_pcmpxstrm_set_result_mask(PRTUINT128U puDst, uint16_t u16Result, uint8_t cElems, uint8_t bImm)
18034{
18035 if (bImm & RT_BIT(6))
18036 {
18037 /* Generate a mask. */
18038 if (cElems == 8)
18039 {
18040 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au16); i++)
18041 if (u16Result & RT_BIT(i))
18042 puDst->au16[i] = 0xffff;
18043 else
18044 puDst->au16[i] = 0;
18045 }
18046 else
18047 {
18048 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au8); i++)
18049 if (u16Result & RT_BIT(i))
18050 puDst->au8[i] = 0xff;
18051 else
18052 puDst->au8[i] = 0;
18053 }
18054 }
18055 else
18056 {
18057 /* Store the result. */
18058 puDst->au64[0] = u16Result;
18059 puDst->au64[1] = 0;
18060 }
18061}
18062
18063IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpistrm_u128_fallback,(PRTUINT128U puDst, uint32_t *pEFlags, PCIEMPCMPISTRXSRC pSrc, uint8_t bEvil))
18064{
18065 uint8_t cElems = (bEvil & RT_BIT(0)) ? 8 : 16;
18066 uint8_t cLen1 = iemAImpl_pcmpistrx_get_str_len_implicit(&pSrc->uSrc1, bEvil);
18067 uint8_t cLen2 = iemAImpl_pcmpistrx_get_str_len_implicit(&pSrc->uSrc2, bEvil);
18068
18069 uint16_t u16Result = iemAImpl_pcmpxstrx_worker(pEFlags, &pSrc->uSrc1, &pSrc->uSrc2, cLen1, cLen2, bEvil);
18070 iemAImpl_pcmpxstrm_set_result_mask(puDst, u16Result, cElems, bEvil);
18071}
18072
18073
18074/**
18075 * [V]PCMPESTRM
18076 */
18077IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpestrm_u128_fallback,(PRTUINT128U puDst, uint32_t *pEFlags, PCIEMPCMPESTRXSRC pSrc, uint8_t bEvil))
18078{
18079 uint8_t cElems = (bEvil & RT_BIT(0)) ? 8 : 16;
18080 uint8_t cLen1 = iemAImpl_pcmpistrx_get_str_len_explicit((int64_t)pSrc->u64Rax, bEvil);
18081 uint8_t cLen2 = iemAImpl_pcmpistrx_get_str_len_explicit((int64_t)pSrc->u64Rdx, bEvil);
18082
18083 uint16_t u16Result = iemAImpl_pcmpxstrx_worker(pEFlags, &pSrc->uSrc1, &pSrc->uSrc2, cLen1, cLen2, bEvil);
18084 iemAImpl_pcmpxstrm_set_result_mask(puDst, u16Result, cElems, bEvil);
18085}
18086
18087
18088/*
18089 * [V]PCLMULQDQ
18090 */
18091IEM_DECL_IMPL_DEF(void, iemAImpl_pclmulqdq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
18092{
18093 iemAImpl_vpclmulqdq_u128_fallback(puDst, puDst, puSrc, bEvil);
18094}
18095
18096
18097IEM_DECL_IMPL_DEF(void, iemAImpl_vpclmulqdq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
18098{
18099 uint64_t uSrc1 = puSrc1->au64[bEvil & 0x1];
18100 uint64_t uSrc2 = puSrc2->au64[(bEvil >> 4) & 0x1];
18101
18102 puDst->au64[0] = 0;
18103 puDst->au64[1] = 0;
18104
18105 /*
18106 * See https://en.wikipedia.org/wiki/Carry-less_product#Example (as of 2022-09-08) for the algorithm.
18107 * Do the first round outside the loop to avoid ASAN complaining about shift exponent being too large (64)
18108 * and squeeze out some optimizations.
18109 */
18110 if (uSrc1 & 0x1)
18111 puDst->au64[0] = uSrc2;
18112
18113 uSrc1 >>= 1;
18114
18115 uint8_t iDigit = 1;
18116 while (uSrc1)
18117 {
18118 if (uSrc1 & 0x1)
18119 {
18120 puDst->au64[0] ^= (uSrc2 << iDigit);
18121 puDst->au64[1] ^= uSrc2 >> (64 - iDigit);
18122 }
18123
18124 uSrc1 >>= 1;
18125 iDigit++;
18126 }
18127}
18128
18129
18130/**
18131 * [V]MOVMSKPS
18132 */
18133#ifdef IEM_WITHOUT_ASSEMBLY
18134IEM_DECL_IMPL_DEF(void, iemAImpl_movmskps_u128,(uint8_t *pu8Dst, PCRTUINT128U puSrc))
18135{
18136 *pu8Dst = puSrc->au32[0] >> 31;
18137 *pu8Dst |= (puSrc->au32[1] >> 31) << 1;
18138 *pu8Dst |= (puSrc->au32[2] >> 31) << 2;
18139 *pu8Dst |= (puSrc->au32[3] >> 31) << 3;
18140}
18141
18142#endif
18143
18144IEM_DECL_IMPL_DEF(void, iemAImpl_vmovmskps_u128_fallback,(uint8_t *pu8Dst, PCRTUINT128U puSrc))
18145{
18146 *pu8Dst = puSrc->au32[0] >> 31;
18147 *pu8Dst |= (puSrc->au32[1] >> 31) << 1;
18148 *pu8Dst |= (puSrc->au32[2] >> 31) << 2;
18149 *pu8Dst |= (puSrc->au32[3] >> 31) << 3;
18150}
18151
18152
18153IEM_DECL_IMPL_DEF(void, iemAImpl_vmovmskps_u256_fallback,(uint8_t *pu8Dst, PCRTUINT256U puSrc))
18154{
18155 *pu8Dst = puSrc->au32[0] >> 31;
18156 *pu8Dst |= (puSrc->au32[1] >> 31) << 1;
18157 *pu8Dst |= (puSrc->au32[2] >> 31) << 2;
18158 *pu8Dst |= (puSrc->au32[3] >> 31) << 3;
18159 *pu8Dst |= (puSrc->au32[4] >> 31) << 4;
18160 *pu8Dst |= (puSrc->au32[5] >> 31) << 5;
18161 *pu8Dst |= (puSrc->au32[6] >> 31) << 6;
18162 *pu8Dst |= (puSrc->au32[7] >> 31) << 7;
18163}
18164
18165
18166/**
18167 * [V]MOVMSKPD
18168 */
18169#ifdef IEM_WITHOUT_ASSEMBLY
18170IEM_DECL_IMPL_DEF(void, iemAImpl_movmskpd_u128,(uint8_t *pu8Dst, PCRTUINT128U puSrc))
18171{
18172 *pu8Dst = puSrc->au64[0] >> 63;
18173 *pu8Dst |= (puSrc->au64[1] >> 63) << 1;
18174}
18175
18176#endif
18177
18178IEM_DECL_IMPL_DEF(void, iemAImpl_vmovmskpd_u128_fallback,(uint8_t *pu8Dst, PCRTUINT128U puSrc))
18179{
18180 *pu8Dst = puSrc->au64[0] >> 63;
18181 *pu8Dst |= (puSrc->au64[1] >> 63) << 1;
18182}
18183
18184
18185IEM_DECL_IMPL_DEF(void, iemAImpl_vmovmskpd_u256_fallback,(uint8_t *pu8Dst, PCRTUINT256U puSrc))
18186{
18187 *pu8Dst = puSrc->au64[0] >> 63;
18188 *pu8Dst |= (puSrc->au64[1] >> 63) << 1;
18189 *pu8Dst |= (puSrc->au64[2] >> 63) << 2;
18190 *pu8Dst |= (puSrc->au64[3] >> 63) << 3;
18191}
18192
18193
18194/**
18195 * CVTTSD2SI
18196 */
18197#ifdef IEM_WITHOUT_ASSEMBLY
18198IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cvttsd2si_i32_r64,(uint32_t uMxCsrIn, int32_t *pi32Dst, const uint64_t *pu64Src))
18199{
18200 RTFLOAT64U r64Src;
18201
18202 r64Src.u = *pu64Src;
18203 iemSsePrepareValueR64(&r64Src, uMxCsrIn, &r64Src); /* The de-normal flag is not set. */
18204
18205 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(uMxCsrIn);
18206 *pi32Dst = f64_to_i32_r_minMag(iemFpSoftF64FromIprt(&r64Src), true /*exact*/, &SoftState);
18207 return uMxCsrIn | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
18208}
18209
18210
18211IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cvttsd2si_i64_r64,(uint32_t uMxCsrIn, int64_t *pi64Dst, const uint64_t *pu64Src))
18212{
18213 RTFLOAT64U r64Src;
18214
18215 r64Src.u = *pu64Src;
18216 iemSsePrepareValueR64(&r64Src, uMxCsrIn, &r64Src); /* The de-normal flag is not set. */
18217
18218 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(uMxCsrIn);
18219 *pi64Dst = f64_to_i64_r_minMag(iemFpSoftF64FromIprt(&r64Src), true /*exact*/, &SoftState);
18220 return uMxCsrIn | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
18221}
18222#endif
18223
18224
18225/**
18226 * CVTSD2SI
18227 */
18228#ifdef IEM_WITHOUT_ASSEMBLY
18229IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cvtsd2si_i32_r64,(uint32_t uMxCsrIn, int32_t *pi32Dst, const uint64_t *pu64Src))
18230{
18231 RTFLOAT64U r64Src;
18232
18233 r64Src.u = *pu64Src;
18234 iemSsePrepareValueR64(&r64Src, uMxCsrIn, &r64Src); /* The de-normal flag is not set. */
18235
18236 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(uMxCsrIn);
18237 *pi32Dst = f64_to_i32(iemFpSoftF64FromIprt(&r64Src), SoftState.roundingMode, true /*exact*/, &SoftState);
18238 return uMxCsrIn | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
18239}
18240
18241
18242IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cvtsd2si_i64_r64,(uint32_t uMxCsrIn, int64_t *pi64Dst, const uint64_t *pu64Src))
18243{
18244 RTFLOAT64U r64Src;
18245
18246 r64Src.u = *pu64Src;
18247 iemSsePrepareValueR64(&r64Src, uMxCsrIn, &r64Src); /* The de-normal flag is not set. */
18248
18249 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(uMxCsrIn);
18250 *pi64Dst = f64_to_i64(iemFpSoftF64FromIprt(&r64Src), SoftState.roundingMode, true /*exact*/, &SoftState);
18251 return uMxCsrIn | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
18252}
18253#endif
18254
18255
18256/**
18257 * CVTTSS2SI
18258 */
18259#ifdef IEM_WITHOUT_ASSEMBLY
18260IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cvttss2si_i32_r32,(uint32_t uMxCsrIn, int32_t *pi32Dst, const uint32_t *pu32Src))
18261{
18262 RTFLOAT32U r32Src;
18263
18264 r32Src.u = *pu32Src;
18265 iemSsePrepareValueR32(&r32Src, uMxCsrIn, &r32Src); /* The de-normal flag is not set. */
18266
18267 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(uMxCsrIn);
18268 *pi32Dst = f32_to_i32_r_minMag(iemFpSoftF32FromIprt(&r32Src), true /*exact*/, &SoftState);
18269 return uMxCsrIn | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
18270}
18271
18272
18273IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cvttss2si_i64_r32,(uint32_t uMxCsrIn, int64_t *pi64Dst, const uint32_t *pu32Src))
18274{
18275 RTFLOAT32U r32Src;
18276
18277 r32Src.u = *pu32Src;
18278 iemSsePrepareValueR32(&r32Src, uMxCsrIn, &r32Src); /* The de-normal flag is not set. */
18279
18280 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(uMxCsrIn);
18281 *pi64Dst = f32_to_i64_r_minMag(iemFpSoftF32FromIprt(&r32Src), true /*exact*/, &SoftState);
18282 return uMxCsrIn | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
18283}
18284#endif
18285
18286
18287/**
18288 * CVTSS2SI
18289 */
18290#ifdef IEM_WITHOUT_ASSEMBLY
18291IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cvtss2si_i32_r32,(uint32_t uMxCsrIn, int32_t *pi32Dst, const uint32_t *pu32Src))
18292{
18293 RTFLOAT32U r32Src;
18294
18295 r32Src.u = *pu32Src;
18296 iemSsePrepareValueR32(&r32Src, uMxCsrIn, &r32Src); /* The de-normal flag is not set. */
18297
18298 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(uMxCsrIn);
18299 *pi32Dst = f32_to_i32(iemFpSoftF32FromIprt(&r32Src), SoftState.roundingMode, true /*exact*/, &SoftState);
18300 return uMxCsrIn | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
18301}
18302
18303
18304IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cvtss2si_i64_r32,(uint32_t uMxCsrIn, int64_t *pi64Dst, const uint32_t *pu32Src))
18305{
18306 RTFLOAT32U r32Src;
18307
18308 r32Src.u = *pu32Src;
18309 iemSsePrepareValueR32(&r32Src, uMxCsrIn, &r32Src); /* The de-normal flag is not set. */
18310
18311 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(uMxCsrIn);
18312 *pi64Dst = f32_to_i64(iemFpSoftF32FromIprt(&r32Src), SoftState.roundingMode, true /*exact*/, &SoftState);
18313 return uMxCsrIn | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
18314}
18315#endif
18316
18317
18318/**
18319 * CVTSI2SD
18320 */
18321#ifdef IEM_WITHOUT_ASSEMBLY
18322IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cvtsi2sd_r64_i32,(uint32_t uMxCsrIn, PRTFLOAT64U pr64Dst, const int32_t *pi32Src))
18323{
18324 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(uMxCsrIn);
18325 float64_t r64Res = i32_to_f64(*pi32Src, &SoftState);
18326 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Res, pr64Dst, uMxCsrIn);
18327}
18328
18329
18330IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cvtsi2sd_r64_i64,(uint32_t uMxCsrIn, PRTFLOAT64U pr64Dst, const int64_t *pi64Src))
18331{
18332 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(uMxCsrIn);
18333 float64_t r64Res = i64_to_f64(*pi64Src, &SoftState);
18334 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Res, pr64Dst, uMxCsrIn);
18335}
18336#endif
18337
18338
18339/**
18340 * CVTSI2SS
18341 */
18342#ifdef IEM_WITHOUT_ASSEMBLY
18343IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cvtsi2ss_r32_i32,(uint32_t uMxCsrIn, PRTFLOAT32U pr32Dst, const int32_t *pi32Src))
18344{
18345 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(uMxCsrIn);
18346 float32_t r32Res = i32_to_f32(*pi32Src, &SoftState);
18347 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Res, pr32Dst, uMxCsrIn);
18348}
18349
18350
18351IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cvtsi2ss_r32_i64,(uint32_t uMxCsrIn, PRTFLOAT32U pr32Dst, const int64_t *pi64Src))
18352{
18353 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(uMxCsrIn);
18354 float32_t r32Res = i64_to_f32(*pi64Src, &SoftState);
18355 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Res, pr32Dst, uMxCsrIn);
18356}
18357#endif
18358
18359
18360/**
18361 * [V]UCOMISS
18362 */
18363#ifdef IEM_WITHOUT_ASSEMBLY
18364IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_ucomiss_u128,(uint32_t uMxCsrIn, uint32_t *pfEFlags, RTFLOAT32U uSrc1, RTFLOAT32U uSrc2))
18365{
18366 uint32_t fEFlagsNew = *pfEFlags & ~X86_EFL_STATUS_BITS;
18367
18368 if (RTFLOAT32U_IS_SIGNALLING_NAN(&uSrc1) || RTFLOAT32U_IS_SIGNALLING_NAN(&uSrc2))
18369 {
18370 uMxCsrIn |= X86_MXCSR_IE;
18371 fEFlagsNew |= X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF; /* UNORDERED 111 */
18372 }
18373 else if (RTFLOAT32U_IS_QUIET_NAN(&uSrc1) || RTFLOAT32U_IS_QUIET_NAN(&uSrc2))
18374 {
18375 /* ucomiss doesn't raise \#IE for quiet NaNs. */
18376 fEFlagsNew |= X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF; /* UNORDERED 111 */
18377 }
18378 else
18379 {
18380 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(uMxCsrIn);
18381
18382 RTFLOAT32U r32Src1, r32Src2;
18383 uint32_t fDe = iemSsePrepareValueR32(&r32Src1, uMxCsrIn, &uSrc1);
18384 fDe |= iemSsePrepareValueR32(&r32Src2, uMxCsrIn, &uSrc2);
18385
18386 float32_t f32Src1 = iemFpSoftF32FromIprt(&r32Src1);
18387 float32_t f32Src2 = iemFpSoftF32FromIprt(&r32Src2);
18388 if (f32_eq(f32Src1, f32Src2, &SoftState))
18389 fEFlagsNew |= X86_EFL_ZF; /* EQUAL 100 */
18390 else if (f32_lt(f32Src1, f32Src2, &SoftState))
18391 fEFlagsNew |= X86_EFL_CF; /* LESS_THAN 001 */
18392 /* else: GREATER_THAN 000 */
18393
18394 uMxCsrIn |= fDe;
18395 }
18396
18397 *pfEFlags = fEFlagsNew;
18398 return uMxCsrIn;
18399}
18400#endif
18401
18402IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_vucomiss_u128_fallback,(uint32_t uMxCsrIn, uint32_t *pfEFlags, RTFLOAT32U uSrc1, RTFLOAT32U uSrc2))
18403{
18404 return iemAImpl_ucomiss_u128(uMxCsrIn, pfEFlags, uSrc1, uSrc2);
18405}
18406
18407
18408/**
18409 * [V]UCOMISD
18410 */
18411#ifdef IEM_WITHOUT_ASSEMBLY
18412IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_ucomisd_u128,(uint32_t uMxCsrIn, uint32_t *pfEFlags, RTFLOAT64U uSrc1, RTFLOAT64U uSrc2))
18413{
18414 uint32_t fEFlagsNew = *pfEFlags & ~X86_EFL_STATUS_BITS;
18415
18416 if (RTFLOAT64U_IS_SIGNALLING_NAN(&uSrc1) || RTFLOAT64U_IS_SIGNALLING_NAN(&uSrc2))
18417 {
18418 uMxCsrIn |= X86_MXCSR_IE;
18419 fEFlagsNew |= X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF; /* UNORDERED 111 */
18420 }
18421 else if (RTFLOAT64U_IS_QUIET_NAN(&uSrc1) || RTFLOAT64U_IS_QUIET_NAN(&uSrc2))
18422 {
18423 /* ucomiss doesn't raise \#IE for quiet NaNs. */
18424 fEFlagsNew |= X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF; /* UNORDERED 111 */
18425 }
18426 else
18427 {
18428 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(uMxCsrIn);
18429
18430 RTFLOAT64U r64Src1, r64Src2;
18431 uint32_t fDe = iemSsePrepareValueR64(&r64Src1, uMxCsrIn, &uSrc1)
18432 | iemSsePrepareValueR64(&r64Src2, uMxCsrIn, &uSrc2);
18433
18434 float64_t f64Src1 = iemFpSoftF64FromIprt(&r64Src1);
18435 float64_t f64Src2 = iemFpSoftF64FromIprt(&r64Src2);
18436 if (f64_eq(f64Src1, f64Src2, &SoftState))
18437 fEFlagsNew |= X86_EFL_ZF; /* EQUAL 100 */
18438 else if (f64_lt(f64Src1, f64Src2, &SoftState))
18439 fEFlagsNew |= X86_EFL_CF; /* LESS_THAN 001 */
18440 /* else: GREATER_THAN 000 */
18441
18442 uMxCsrIn |= fDe;
18443 }
18444
18445 *pfEFlags = fEFlagsNew;
18446 return uMxCsrIn;
18447}
18448#endif
18449
18450IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_vucomisd_u128_fallback,(uint32_t uMxCsrIn, uint32_t *pfEFlags, RTFLOAT64U uSrc1, RTFLOAT64U uSrc2))
18451{
18452 return iemAImpl_ucomisd_u128(uMxCsrIn, pfEFlags, uSrc1, uSrc2);
18453}
18454
18455
18456/**
18457 * [V]COMISS
18458 */
18459#ifdef IEM_WITHOUT_ASSEMBLY
18460IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_comiss_u128,(uint32_t uMxCsrIn, uint32_t *pfEFlags, RTFLOAT32U uSrc1, RTFLOAT32U uSrc2))
18461{
18462 uint32_t fEFlagsNew = *pfEFlags & ~X86_EFL_STATUS_BITS;
18463
18464 if ( RTFLOAT32U_IS_SIGNALLING_NAN(&uSrc1) || RTFLOAT32U_IS_SIGNALLING_NAN(&uSrc2)
18465 || RTFLOAT32U_IS_QUIET_NAN(&uSrc1) || RTFLOAT32U_IS_QUIET_NAN(&uSrc2))
18466 {
18467 uMxCsrIn |= X86_MXCSR_IE;
18468 fEFlagsNew |= X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF; /* UNORDERED 111 */
18469 }
18470 else
18471 {
18472 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(uMxCsrIn);
18473
18474 RTFLOAT32U r32Src1, r32Src2;
18475 uint32_t fDe = iemSsePrepareValueR32(&r32Src1, uMxCsrIn, &uSrc1)
18476 | iemSsePrepareValueR32(&r32Src2, uMxCsrIn, &uSrc2);
18477
18478 float32_t f32Src1 = iemFpSoftF32FromIprt(&r32Src1);
18479 float32_t f32Src2 = iemFpSoftF32FromIprt(&r32Src2);
18480 if (f32_eq(f32Src1, f32Src2, &SoftState))
18481 fEFlagsNew |= X86_EFL_ZF; /* EQUAL 100 */
18482 else if (f32_lt(f32Src1, f32Src2, &SoftState))
18483 fEFlagsNew |= X86_EFL_CF; /* LESS_THAN 001 */
18484 /* else: GREATER_THAN 000 */
18485
18486 uMxCsrIn |= fDe;
18487 }
18488
18489 *pfEFlags = fEFlagsNew;
18490 return uMxCsrIn;
18491}
18492#endif
18493
18494
18495IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_vcomiss_u128_fallback,(uint32_t uMxCsrIn, uint32_t *pfEFlags, RTFLOAT32U uSrc1, RTFLOAT32U uSrc2))
18496{
18497 return iemAImpl_comiss_u128(uMxCsrIn, pfEFlags, uSrc1, uSrc2);
18498}
18499
18500
18501/**
18502 * [V]COMISD
18503 */
18504#ifdef IEM_WITHOUT_ASSEMBLY
18505IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_comisd_u128,(uint32_t uMxCsrIn, uint32_t *pfEFlags, RTFLOAT64U uSrc1, RTFLOAT64U uSrc2))
18506{
18507 uint32_t fEFlagsNew = *pfEFlags & ~X86_EFL_STATUS_BITS;
18508
18509 if ( RTFLOAT64U_IS_SIGNALLING_NAN(&uSrc1) || RTFLOAT64U_IS_SIGNALLING_NAN(&uSrc2)
18510 || RTFLOAT64U_IS_QUIET_NAN(&uSrc1) || RTFLOAT64U_IS_QUIET_NAN(&uSrc2))
18511 {
18512 uMxCsrIn |= X86_MXCSR_IE;
18513 fEFlagsNew |= X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF; /* UNORDERED 111 */
18514 }
18515 else
18516 {
18517 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(uMxCsrIn);
18518
18519 RTFLOAT64U r64Src1, r64Src2;
18520 uint32_t fDe = iemSsePrepareValueR64(&r64Src1, uMxCsrIn, &uSrc1);
18521 fDe |= iemSsePrepareValueR64(&r64Src2, uMxCsrIn, &uSrc2);
18522
18523 float64_t f64Src1 = iemFpSoftF64FromIprt(&r64Src1);
18524 float64_t f64Src2 = iemFpSoftF64FromIprt(&r64Src2);
18525 if (f64_eq(f64Src1, f64Src2, &SoftState))
18526 fEFlagsNew |= X86_EFL_ZF; /* EQUAL 100 */
18527 else if (f64_lt(f64Src1, f64Src2, &SoftState))
18528 fEFlagsNew |= X86_EFL_CF; /* LESS_THAN 001 */
18529 /* else: GREATER_THAN 000 */
18530
18531 uMxCsrIn |= fDe;
18532 }
18533
18534 *pfEFlags = fEFlagsNew;
18535 return uMxCsrIn;
18536}
18537#endif
18538
18539IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_vcomisd_u128_fallback,(uint32_t uMxCsrIn, uint32_t *pfEFlags, RTFLOAT64U uSrc1, RTFLOAT64U uSrc2))
18540{
18541 return iemAImpl_comisd_u128(uMxCsrIn, pfEFlags, uSrc1, uSrc2);
18542}
18543
18544
18545/**
18546 * CMPPS / CMPPD / CMPSS / CMPSD
18547 */
18548#ifdef IEM_WITHOUT_ASSEMBLY
/**
 * One row of the compare predicate truth table.
 *
 * Each row gives the boolean outcome of a predicate for the four possible
 * relations between the operands A and B.
 */
typedef struct CMPTRUTHTBLENTRY
{
    /** Whether \#IA is signalled when one of the source operands is a QNaN. */
    bool                fSignalsOnQNan;
    /** Result when the operands are unordered. */
    bool                fUnordered;
    /** Result when A = B. */
    bool                fEqual;
    /** Result when A < B. */
    bool                fLowerThan;
    /** Result when A > B. */
    bool                fGreaterThan;
} CMPTRUTHTBLENTRY;
/** Pointer to a read-only truth table entry. */
typedef CMPTRUTHTBLENTRY const *PCCMPTRUTHTBLENTRY;
18567
18568
18569/** The compare truth table (indexed by immediate). */
18570static const CMPTRUTHTBLENTRY g_aCmpTbl[] =
18571{
18572 /* fSignalsOnQNan fUnordered fEqual fLowerThan fGreaterThan */
18573 /* 00H (EQ_OQ) */ { false, false, true, false, false },
18574 /* 01H (LT_OS) */ { true, false, false, true, false },
18575 /* 02H (LE_OS) */ { true, false, true, true, false },
18576 /* 03H (UNORD_Q) */ { false, true, false, false, false },
18577 /* 04H (NEQ_UQ) */ { false, true, false, true, true },
18578 /* 05H (NLT_US) */ { true, true, true, false, true },
18579 /* 06H (NLE_US) */ { true, true, false, false, true },
18580 /* 07H (ORQ_Q) */ { false, false, true, true, true },
18581 /** @todo AVX variants. */
18582};
18583
18584
18585static bool iemAImpl_cmp_worker_r32(uint32_t *pfMxcsr, PCRTFLOAT32U pr32Src1, PCRTFLOAT32U pr32Src2, uint8_t bEvil)
18586{
18587 bool fRes;
18588 AssertRelease(bEvil < RT_ELEMENTS(g_aCmpTbl));
18589
18590 if (RTFLOAT32U_IS_SIGNALLING_NAN(pr32Src1) || RTFLOAT32U_IS_SIGNALLING_NAN(pr32Src2))
18591 {
18592 *pfMxcsr |= X86_MXCSR_IE;
18593 fRes = g_aCmpTbl[bEvil].fUnordered;
18594 }
18595 else if (RTFLOAT32U_IS_QUIET_NAN(pr32Src1) || RTFLOAT32U_IS_QUIET_NAN(pr32Src2))
18596 {
18597 if (g_aCmpTbl[bEvil].fSignalsOnQNan)
18598 *pfMxcsr |= X86_MXCSR_IE;
18599 fRes = g_aCmpTbl[bEvil].fUnordered;
18600 }
18601 else
18602 {
18603 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(*pfMxcsr);
18604
18605 RTFLOAT32U r32Src1, r32Src2;
18606 uint32_t fDe = iemSsePrepareValueR32(&r32Src1, *pfMxcsr, pr32Src1);
18607 fDe |= iemSsePrepareValueR32(&r32Src2, *pfMxcsr, pr32Src2);
18608
18609 *pfMxcsr |= fDe;
18610 float32_t f32Src1 = iemFpSoftF32FromIprt(&r32Src1);
18611 float32_t f32Src2 = iemFpSoftF32FromIprt(&r32Src2);
18612 if (f32_eq(f32Src1, f32Src2, &SoftState))
18613 fRes = g_aCmpTbl[bEvil].fEqual;
18614 else if (f32_lt(f32Src1, f32Src2, &SoftState))
18615 fRes = g_aCmpTbl[bEvil].fLowerThan;
18616 else
18617 fRes = g_aCmpTbl[bEvil].fGreaterThan;
18618 }
18619
18620 return fRes;
18621}
18622
18623
18624static bool iemAImpl_cmp_worker_r64(uint32_t *pfMxcsr, PCRTFLOAT64U pr64Src1, PCRTFLOAT64U pr64Src2, uint8_t bEvil)
18625{
18626 bool fRes;
18627 AssertRelease(bEvil < RT_ELEMENTS(g_aCmpTbl));
18628
18629 if (RTFLOAT64U_IS_SIGNALLING_NAN(pr64Src1) || RTFLOAT64U_IS_SIGNALLING_NAN(pr64Src2))
18630 {
18631 *pfMxcsr |= X86_MXCSR_IE;
18632 fRes = g_aCmpTbl[bEvil].fUnordered;
18633 }
18634 else if (RTFLOAT64U_IS_QUIET_NAN(pr64Src1) || RTFLOAT64U_IS_QUIET_NAN(pr64Src2))
18635 {
18636 if (g_aCmpTbl[bEvil].fSignalsOnQNan)
18637 *pfMxcsr |= X86_MXCSR_IE;
18638 fRes = g_aCmpTbl[bEvil].fUnordered;
18639 }
18640 else
18641 {
18642 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(*pfMxcsr);
18643
18644 RTFLOAT64U r64Src1, r64Src2;
18645 uint32_t fDe = iemSsePrepareValueR64(&r64Src1, *pfMxcsr, pr64Src1)
18646 | iemSsePrepareValueR64(&r64Src2, *pfMxcsr, pr64Src2);
18647
18648 *pfMxcsr |= fDe;
18649 float64_t f64Src1 = iemFpSoftF64FromIprt(&r64Src1);
18650 float64_t f64Src2 = iemFpSoftF64FromIprt(&r64Src2);
18651 if (f64_eq(f64Src1, f64Src2, &SoftState))
18652 fRes = g_aCmpTbl[bEvil].fEqual;
18653 else if (f64_lt(f64Src1, f64Src2, &SoftState))
18654 fRes = g_aCmpTbl[bEvil].fLowerThan;
18655 else
18656 fRes = g_aCmpTbl[bEvil].fGreaterThan;
18657 }
18658
18659 return fRes;
18660}
18661
18662
18663IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cmpps_u128,(uint32_t uMxCsrIn, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bEvil))
18664{
18665 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->ar32); i++)
18666 {
18667 if (iemAImpl_cmp_worker_r32(&uMxCsrIn, &pSrc->uSrc1.ar32[i], &pSrc->uSrc2.ar32[i], bEvil & 0x7))
18668 puDst->au32[i] = UINT32_MAX;
18669 else
18670 puDst->au32[i] = 0;
18671 }
18672
18673 return uMxCsrIn;
18674}
18675
18676
18677IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cmppd_u128,(uint32_t uMxCsrIn, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bEvil))
18678{
18679 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->ar64); i++)
18680 {
18681 if (iemAImpl_cmp_worker_r64(&uMxCsrIn, &pSrc->uSrc1.ar64[i], &pSrc->uSrc2.ar64[i], bEvil & 0x7))
18682 puDst->au64[i] = UINT64_MAX;
18683 else
18684 puDst->au64[i] = 0;
18685 }
18686
18687 return uMxCsrIn;
18688}
18689
18690
18691IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cmpss_u128,(uint32_t uMxCsrIn, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bEvil))
18692{
18693 if (iemAImpl_cmp_worker_r32(&uMxCsrIn, &pSrc->uSrc1.ar32[0], &pSrc->uSrc2.ar32[0], bEvil & 0x7))
18694 puDst->au32[0] = UINT32_MAX;
18695 else
18696 puDst->au32[0] = 0;
18697
18698 puDst->au32[1] = pSrc->uSrc1.au32[1];
18699 puDst->au64[1] = pSrc->uSrc1.au64[1];
18700 return uMxCsrIn;
18701}
18702
18703
18704IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cmpsd_u128,(uint32_t uMxCsrIn, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bEvil))
18705{
18706 if (iemAImpl_cmp_worker_r64(&uMxCsrIn, &pSrc->uSrc1.ar64[0], &pSrc->uSrc2.ar64[0], bEvil & 0x7))
18707 puDst->au64[0] = UINT64_MAX;
18708 else
18709 puDst->au64[0] = 0;
18710
18711 puDst->au64[1] = pSrc->uSrc1.au64[1];
18712 return uMxCsrIn;
18713}
18714#endif
18715
18716
18717/**
18718 * ROUNDPS / ROUNDPD / ROUNDSS / ROUNDSD
18719 */
18720
/** ROUNDxx immediate: the rounding control field (used when the select bit is clear). */
#define X86_SSE_ROUNDXX_IMM_RC_MASK     UINT8_C(0x03)
/** ROUNDxx immediate: when set, the rounding mode comes from MXCSR instead of the immediate. */
#define X86_SSE_ROUNDXX_IMM_ROUND_SEL   UINT8_C(0x04)
/** ROUNDxx immediate: when set, the rounding is performed non-exact (see the round workers). */
#define X86_SSE_ROUNDXX_IMM_PRECISION   UINT8_C(0x08)

/** ROUNDxx immediate: all the bits above combined (0x0F). */
#define X86_SSE_ROUNDXX_IMM_MASK        (X86_SSE_ROUNDXX_IMM_RC_MASK | X86_SSE_ROUNDXX_IMM_ROUND_SEL | X86_SSE_ROUNDXX_IMM_PRECISION)
18726
18727DECLINLINE(softfloat_state_t) iemSseRoundXXMxcsrAndImmToSoftState(uint32_t fMxcsr, uint8_t bImm)
18728{
18729 if (bImm & X86_SSE_ROUNDXX_IMM_ROUND_SEL)
18730 return IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
18731
18732 fMxcsr &= ~X86_MXCSR_RC_MASK;
18733 fMxcsr |= (bImm & X86_SSE_ROUNDXX_IMM_RC_MASK) << X86_MXCSR_RC_SHIFT;
18734 return IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
18735}
18736
18737static RTFLOAT32U iemAImpl_round_worker_r32(uint32_t *pfMxcsr, PCRTFLOAT32U pr32Src, uint8_t bImm)
18738{
18739 RTFLOAT32U r32Src, r32Dst;
18740 float32_t f32Src;
18741 softfloat_state_t SoftState = iemSseRoundXXMxcsrAndImmToSoftState(*pfMxcsr, bImm);
18742 bool fExact = !RT_BOOL(bImm & X86_SSE_ROUNDXX_IMM_PRECISION);
18743
18744 iemSsePrepareValueR32(&r32Src, *pfMxcsr, pr32Src);
18745 f32Src = f32_roundToInt(iemFpSoftF32FromIprt(&r32Src), SoftState.roundingMode, fExact, &SoftState);
18746
18747 iemFpSoftF32ToIprt(&r32Dst, f32Src);
18748 return r32Dst;
18749}
18750
18751static RTFLOAT64U iemAImpl_round_worker_r64(uint32_t *pfMxcsr, PCRTFLOAT64U pr64Src, uint8_t bImm)
18752{
18753 RTFLOAT64U r64Src, r64Dst;
18754 float64_t f64Src;
18755 softfloat_state_t SoftState = iemSseRoundXXMxcsrAndImmToSoftState(*pfMxcsr, bImm);
18756 bool fExact = !RT_BOOL(bImm & X86_SSE_ROUNDXX_IMM_PRECISION);
18757
18758 iemSsePrepareValueR64(&r64Src, *pfMxcsr, pr64Src);
18759 f64Src = f64_roundToInt(iemFpSoftF64FromIprt(&r64Src), SoftState.roundingMode, fExact, &SoftState);
18760
18761 iemFpSoftF64ToIprt(&r64Dst, f64Src);
18762 return r64Dst;
18763}
18764
18765#ifdef IEM_WITHOUT_ASSEMBLY
18766IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_roundss_u128,(uint32_t uMxCsrIn, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bImm))
18767{
18768 puDst->ar32[0] = iemAImpl_round_worker_r32(&uMxCsrIn, &pSrc->uSrc2.ar32[0], bImm & X86_SSE_ROUNDXX_IMM_MASK);
18769 puDst->au32[1] = pSrc->uSrc1.au32[1];
18770 puDst->au64[1] = pSrc->uSrc1.au64[1];
18771 return uMxCsrIn;
18772}
18773
18774
18775IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_roundsd_u128,(uint32_t uMxCsrIn, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bImm))
18776{
18777 puDst->ar64[0] = iemAImpl_round_worker_r64(&uMxCsrIn, &pSrc->uSrc2.ar64[0], bImm & X86_SSE_ROUNDXX_IMM_MASK);
18778 puDst->au64[1] = pSrc->uSrc1.au64[1];
18779 return uMxCsrIn;
18780}
18781#endif
18782
18783IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_roundps_u128_fallback,(uint32_t uMxCsrIn, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bImm))
18784{
18785 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->ar32); i++)
18786 {
18787 puDst->ar32[i] = iemAImpl_round_worker_r32(&uMxCsrIn, &pSrc->uSrc2.ar32[i], bImm & X86_SSE_ROUNDXX_IMM_MASK);
18788 }
18789
18790 return uMxCsrIn;
18791}
18792
18793
18794IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_roundpd_u128_fallback,(uint32_t uMxCsrIn, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bImm))
18795{
18796 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->ar64); i++)
18797 {
18798 puDst->ar64[i] = iemAImpl_round_worker_r64(&uMxCsrIn, &pSrc->uSrc2.ar64[i], bImm & X86_SSE_ROUNDXX_IMM_MASK);
18799 }
18800
18801 return uMxCsrIn;
18802}
18803
18804/**
18805 * CVTPD2PI
18806 */
18807#ifdef IEM_WITHOUT_ASSEMBLY
18808static uint32_t iemAImpl_cvtpd2pi_u128_worker(uint32_t fMxcsr, int32_t *pi32Dst, PCRTFLOAT64U pr64Src)
18809{
18810 RTFLOAT64U r64Src;
18811 iemSsePrepareValueR64(&r64Src, fMxcsr, pr64Src); /* The de-normal flag is not set. */
18812
18813 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
18814 *pi32Dst = f64_to_i32(iemFpSoftF64FromIprt(&r64Src), SoftState.roundingMode, true /*exact*/, &SoftState);
18815 return fMxcsr | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
18816}
18817
18818
18819IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cvtpd2pi_u128,(uint32_t fMxCsrIn, uint64_t *pu64Dst, PCX86XMMREG pSrc))
18820{
18821 RTUINT64U u64Res;
18822 uint32_t fMxcsrOut = iemAImpl_cvtpd2pi_u128_worker(fMxCsrIn, &u64Res.ai32[0], &pSrc->ar64[0]);
18823 fMxcsrOut |= iemAImpl_cvtpd2pi_u128_worker(fMxCsrIn, &u64Res.ai32[1], &pSrc->ar64[1]);
18824
18825 *pu64Dst = u64Res.u;
18826 return fMxcsrOut;
18827}
18828#endif
18829
18830
18831/**
18832 * CVTTPD2PI
18833 */
18834#ifdef IEM_WITHOUT_ASSEMBLY
18835static uint32_t iemAImpl_cvttpd2pi_u128_worker(uint32_t fMxcsr, int32_t *pi32Dst, PCRTFLOAT64U pr64Src)
18836{
18837 RTFLOAT64U r64Src;
18838 iemSsePrepareValueR64(&r64Src, fMxcsr, pr64Src); /* The de-normal flag is not set. */
18839
18840 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
18841 *pi32Dst = f64_to_i32_r_minMag(iemFpSoftF64FromIprt(&r64Src), true /*exact*/, &SoftState);
18842 return fMxcsr | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
18843}
18844
18845
18846IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cvttpd2pi_u128,(uint32_t fMxCsrIn, uint64_t *pu64Dst, PCX86XMMREG pSrc))
18847{
18848 RTUINT64U u64Res;
18849 uint32_t fMxcsrOut = iemAImpl_cvttpd2pi_u128_worker(fMxCsrIn, &u64Res.ai32[0], &pSrc->ar64[0]);
18850 fMxcsrOut |= iemAImpl_cvttpd2pi_u128_worker(fMxCsrIn, &u64Res.ai32[1], &pSrc->ar64[1]);
18851
18852 *pu64Dst = u64Res.u;
18853 return fMxcsrOut;
18854}
18855#endif
18856
18857
18858/**
18859 * CVTPI2PS
18860 */
18861#ifdef IEM_WITHOUT_ASSEMBLY
18862static uint32_t iemAImpl_cvtpi2ps_u128_worker(uint32_t fMxcsr, PRTFLOAT32U pr32Dst, int32_t i32Src)
18863{
18864 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
18865 float32_t r32Res = i32_to_f32(i32Src, &SoftState);
18866 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Res, pr32Dst, fMxcsr);
18867}
18868
18869
18870IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cvtpi2ps_u128,(uint32_t fMxCsrIn, PX86XMMREG pDst, uint64_t u64Src))
18871{
18872 RTUINT64U uSrc = { u64Src };
18873 uint32_t fMxcsrOut = iemAImpl_cvtpi2ps_u128_worker(fMxCsrIn, &pDst->ar32[0], uSrc.ai32[0]);
18874 fMxcsrOut |= iemAImpl_cvtpi2ps_u128_worker(fMxCsrIn, &pDst->ar32[1], uSrc.ai32[1]);
18875 return fMxcsrOut;
18876}
18877#endif
18878
18879
18880/**
18881 * CVTPI2PD
18882 */
18883#ifdef IEM_WITHOUT_ASSEMBLY
18884static uint32_t iemAImpl_cvtpi2pd_u128_worker(uint32_t fMxcsr, PRTFLOAT64U pr64Dst, int32_t i32Src)
18885{
18886 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
18887 float64_t r64Res = i32_to_f64(i32Src, &SoftState);
18888 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Res, pr64Dst, fMxcsr);
18889}
18890
18891
18892IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cvtpi2pd_u128,(uint32_t fMxCsrIn, PX86XMMREG pDst, uint64_t u64Src))
18893{
18894 RTUINT64U uSrc = { u64Src };
18895 uint32_t fMxcsrOut = iemAImpl_cvtpi2pd_u128_worker(fMxCsrIn, &pDst->ar64[0], uSrc.ai32[0]);
18896 fMxcsrOut |= iemAImpl_cvtpi2pd_u128_worker(fMxCsrIn, &pDst->ar64[1], uSrc.ai32[1]);
18897 return fMxcsrOut;
18898}
18899#endif
18900
18901
18902/**
18903 * CVTPS2PI
18904 */
18905#ifdef IEM_WITHOUT_ASSEMBLY
18906static uint32_t iemAImpl_cvtps2pi_u128_worker(uint32_t fMxcsr, int32_t *pi32Dst, PCRTFLOAT32U pr32Src)
18907{
18908 RTFLOAT32U r32Src;
18909 iemSsePrepareValueR32(&r32Src, fMxcsr, pr32Src); /* The de-normal flag is not set. */
18910
18911 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
18912 *pi32Dst = f32_to_i32(iemFpSoftF32FromIprt(&r32Src), SoftState.roundingMode, true /*exact*/, &SoftState);
18913 return fMxcsr | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
18914}
18915
18916
18917IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cvtps2pi_u128,(uint32_t fMxCsrIn, uint64_t *pu64Dst, uint64_t u64Src))
18918{
18919 RTUINT64U uDst;
18920 RTUINT64U uSrc = { u64Src };
18921 uint32_t fMxcsrOut = iemAImpl_cvtps2pi_u128_worker(fMxCsrIn, &uDst.ai32[0], (PCRTFLOAT32U)&uSrc.au32[0]);
18922 fMxcsrOut |= iemAImpl_cvtps2pi_u128_worker(fMxCsrIn, &uDst.ai32[1], (PCRTFLOAT32U)&uSrc.au32[1]);
18923 *pu64Dst = uDst.u;
18924 return fMxcsrOut;
18925}
18926#endif
18927
18928
18929/**
18930 * CVTTPS2PI
18931 */
18932#ifdef IEM_WITHOUT_ASSEMBLY
18933static uint32_t iemAImpl_cvttps2pi_u128_worker(uint32_t fMxcsr, int32_t *pi32Dst, PCRTFLOAT32U pr32Src)
18934{
18935 RTFLOAT32U r32Src;
18936 iemSsePrepareValueR32(&r32Src, fMxcsr, pr32Src); /* The de-normal flag is not set. */
18937
18938 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
18939 *pi32Dst = f32_to_i32_r_minMag(iemFpSoftF32FromIprt(&r32Src), true /*exact*/, &SoftState);
18940 return fMxcsr | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
18941}
18942
18943
18944IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_cvttps2pi_u128,(uint32_t fMxCsrIn, uint64_t *pu64Dst, uint64_t u64Src))
18945{
18946 RTUINT64U uDst;
18947 RTUINT64U uSrc = { u64Src };
18948 uint32_t fMxcsrOut = iemAImpl_cvttps2pi_u128_worker(fMxCsrIn, &uDst.ai32[0], (PCRTFLOAT32U)&uSrc.au32[0]);
18949 fMxcsrOut |= iemAImpl_cvttps2pi_u128_worker(fMxCsrIn, &uDst.ai32[1], (PCRTFLOAT32U)&uSrc.au32[1]);
18950 *pu64Dst = uDst.u;
18951 return fMxcsrOut;
18952}
18953#endif
18954
18955/**
18956 * RDRAND
18957 */
18958IEM_DECL_IMPL_DEF(void, iemAImpl_rdrand_u16_fallback,(uint16_t *puDst, uint32_t *pEFlags))
18959{
18960 *puDst = 0;
18961 *pEFlags &= ~X86_EFL_STATUS_BITS;
18962 *pEFlags |= X86_EFL_CF;
18963}
18964
18965IEM_DECL_IMPL_DEF(void, iemAImpl_rdrand_u32_fallback,(uint32_t *puDst, uint32_t *pEFlags))
18966{
18967 *puDst = 0;
18968 *pEFlags &= ~X86_EFL_STATUS_BITS;
18969 *pEFlags |= X86_EFL_CF;
18970}
18971
18972IEM_DECL_IMPL_DEF(void, iemAImpl_rdrand_u64_fallback,(uint64_t *puDst, uint32_t *pEFlags))
18973{
18974 *puDst = 0;
18975 *pEFlags &= ~X86_EFL_STATUS_BITS;
18976 *pEFlags |= X86_EFL_CF;
18977}
18978
18979/**
18980 * RDSEED
18981 */
18982IEM_DECL_IMPL_DEF(void, iemAImpl_rdseed_u16_fallback,(uint16_t *puDst, uint32_t *pEFlags))
18983{
18984 *puDst = 0;
18985 *pEFlags &= ~X86_EFL_STATUS_BITS;
18986 *pEFlags |= X86_EFL_CF;
18987}
18988
18989IEM_DECL_IMPL_DEF(void, iemAImpl_rdseed_u32_fallback,(uint32_t *puDst, uint32_t *pEFlags))
18990{
18991 *puDst = 0;
18992 *pEFlags &= ~X86_EFL_STATUS_BITS;
18993 *pEFlags |= X86_EFL_CF;
18994}
18995
18996IEM_DECL_IMPL_DEF(void, iemAImpl_rdseed_u64_fallback,(uint64_t *puDst, uint32_t *pEFlags))
18997{
18998 *puDst = 0;
18999 *pEFlags &= ~X86_EFL_STATUS_BITS;
19000 *pEFlags |= X86_EFL_CF;
19001}
19002
19003
19004/**
19005 * SHA1NEXTE
19006 */
19007IEM_DECL_IMPL_DEF(void, iemAImpl_sha1nexte_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
19008{
19009 uint32_t u32Tmp = ASMRotateLeftU32(puDst->au32[3], 30);
19010
19011 puDst->au32[0] = puSrc->au32[0];
19012 puDst->au32[1] = puSrc->au32[1];
19013 puDst->au32[2] = puSrc->au32[2];
19014 puDst->au32[3] = puSrc->au32[3] + u32Tmp;
19015}
19016
19017/**
19018 * SHA1MSG1
19019 */
19020IEM_DECL_IMPL_DEF(void, iemAImpl_sha1msg1_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
19021{
19022 uint32_t u32W0 = puDst->au32[3];
19023 uint32_t u32W1 = puDst->au32[2];
19024 uint32_t u32W2 = puDst->au32[1];
19025 uint32_t u32W3 = puDst->au32[0];
19026 uint32_t u32W4 = puSrc->au32[3];
19027 uint32_t u32W5 = puSrc->au32[2];
19028
19029 puDst->au32[3] = u32W2 ^ u32W0;
19030 puDst->au32[2] = u32W3 ^ u32W1;
19031 puDst->au32[1] = u32W4 ^ u32W2;
19032 puDst->au32[0] = u32W5 ^ u32W3;
19033}
19034
19035/**
19036 * SHA1MSG2
19037 */
19038IEM_DECL_IMPL_DEF(void, iemAImpl_sha1msg2_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
19039{
19040 uint32_t u32W13 = puSrc->au32[2];
19041 uint32_t u32W14 = puSrc->au32[1];
19042 uint32_t u32W15 = puSrc->au32[0];
19043 uint32_t u32W16 = ASMRotateLeftU32(puDst->au32[3] ^ u32W13, 1);
19044 uint32_t u32W17 = ASMRotateLeftU32(puDst->au32[2] ^ u32W14, 1);
19045 uint32_t u32W18 = ASMRotateLeftU32(puDst->au32[1] ^ u32W15, 1);
19046 uint32_t u32W19 = ASMRotateLeftU32(puDst->au32[0] ^ u32W16, 1);
19047
19048 puDst->au32[3] = u32W16;
19049 puDst->au32[2] = u32W17;
19050 puDst->au32[1] = u32W18;
19051 puDst->au32[0] = u32W19;
19052}
19053
19054/**
19055 * SHA1RNDS4
19056 */
19057typedef IEM_DECL_IMPL_TYPE(uint32_t, FNIEMAIMPLSHA1RNDS4FN, (uint32_t u32B, uint32_t u32C, uint32_t u32D));
19058typedef FNIEMAIMPLSHA1RNDS4FN *PFNIEMAIMPLSHA1RNDS4FN;
19059
19060static DECLCALLBACK(uint32_t) iemAImpl_sha1rnds4_f0(uint32_t u32B, uint32_t u32C, uint32_t u32D) RT_NOEXCEPT
19061{
19062 return (u32B & u32C) ^ (~u32B & u32D);
19063}
19064
19065static DECLCALLBACK(uint32_t) iemAImpl_sha1rnds4_f1(uint32_t u32B, uint32_t u32C, uint32_t u32D) RT_NOEXCEPT
19066{
19067 return u32B ^ u32C ^ u32D;
19068}
19069
19070static DECLCALLBACK(uint32_t) iemAImpl_sha1rnds4_f2(uint32_t u32B, uint32_t u32C, uint32_t u32D) RT_NOEXCEPT
19071{
19072 return (u32B & u32C) ^ (u32B & u32D) ^ (u32C & u32D);
19073}
19074
19075static DECLCALLBACK(uint32_t) iemAImpl_sha1rnds4_f3(uint32_t u32B, uint32_t u32C, uint32_t u32D) RT_NOEXCEPT
19076{
19077 return u32B ^ u32C ^ u32D;
19078}
19079
19080IEM_DECL_IMPL_DEF(void, iemAImpl_sha1rnds4_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
19081{
19082 static uint32_t s_au32K[] = { UINT32_C(0x5a827999), UINT32_C(0x6ed9eba1), UINT32_C(0x8f1bbcdc), UINT32_C(0xca62c1d6) };
19083 static PFNIEMAIMPLSHA1RNDS4FN s_apfnFn[] = { iemAImpl_sha1rnds4_f0, iemAImpl_sha1rnds4_f1, iemAImpl_sha1rnds4_f2, iemAImpl_sha1rnds4_f3 };
19084
19085 uint32_t au32A[5];
19086 uint32_t au32B[5];
19087 uint32_t au32C[5];
19088 uint32_t au32D[5];
19089 uint32_t au32E[5];
19090 uint32_t au32W[4];
19091 PFNIEMAIMPLSHA1RNDS4FN pfnFn = s_apfnFn[bEvil & 0x3];
19092 uint32_t u32K = s_au32K[bEvil & 0x3];
19093
19094 au32A[0] = puDst->au32[3];
19095 au32B[0] = puDst->au32[2];
19096 au32C[0] = puDst->au32[1];
19097 au32D[0] = puDst->au32[0];
19098 for (uint32_t i = 0; i < RT_ELEMENTS(au32W); i++)
19099 au32W[i] = puSrc->au32[3 - i];
19100
19101 /* Round 0 is a bit different than the other rounds. */
19102 au32A[1] = pfnFn(au32B[0], au32C[0], au32D[0]) + ASMRotateLeftU32(au32A[0], 5) + au32W[0] + u32K;
19103 au32B[1] = au32A[0];
19104 au32C[1] = ASMRotateLeftU32(au32B[0], 30);
19105 au32D[1] = au32C[0];
19106 au32E[1] = au32D[0];
19107
19108 for (uint32_t i = 1; i <= 3; i++)
19109 {
19110 au32A[i + 1] = pfnFn(au32B[i], au32C[i], au32D[i]) + ASMRotateLeftU32(au32A[i], 5) + au32W[i] + au32E[i] + u32K;
19111 au32B[i + 1] = au32A[i];
19112 au32C[i + 1] = ASMRotateLeftU32(au32B[i], 30);
19113 au32D[i + 1] = au32C[i];
19114 au32E[i + 1] = au32D[i];
19115 }
19116
19117 puDst->au32[3] = au32A[4];
19118 puDst->au32[2] = au32B[4];
19119 puDst->au32[1] = au32C[4];
19120 puDst->au32[0] = au32D[4];
19121}
19122
19123
19124/**
19125 * SHA256MSG1
19126 */
19127DECLINLINE(uint32_t) iemAImpl_sha256_lower_sigma0(uint32_t u32Val)
19128{
19129 return ASMRotateRightU32(u32Val, 7) ^ ASMRotateRightU32(u32Val, 18) ^ (u32Val >> 3);
19130}
19131
19132IEM_DECL_IMPL_DEF(void, iemAImpl_sha256msg1_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
19133{
19134 uint32_t u32W4 = puSrc->au32[0];
19135 uint32_t u32W3 = puDst->au32[3];
19136 uint32_t u32W2 = puDst->au32[2];
19137 uint32_t u32W1 = puDst->au32[1];
19138 uint32_t u32W0 = puDst->au32[0];
19139
19140 puDst->au32[3] = u32W3 + iemAImpl_sha256_lower_sigma0(u32W4);
19141 puDst->au32[2] = u32W2 + iemAImpl_sha256_lower_sigma0(u32W3);
19142 puDst->au32[1] = u32W1 + iemAImpl_sha256_lower_sigma0(u32W2);
19143 puDst->au32[0] = u32W0 + iemAImpl_sha256_lower_sigma0(u32W1);
19144}
19145
19146/**
19147 * SHA256MSG2
19148 */
19149DECLINLINE(uint32_t) iemAImpl_sha256_lower_sigma1(uint32_t u32Val)
19150{
19151 return ASMRotateRightU32(u32Val, 17) ^ ASMRotateRightU32(u32Val, 19) ^ (u32Val >> 10);
19152}
19153
19154IEM_DECL_IMPL_DEF(void, iemAImpl_sha256msg2_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
19155{
19156 uint32_t u32W14 = puSrc->au32[2];
19157 uint32_t u32W15 = puSrc->au32[3];
19158 uint32_t u32W16 = puDst->au32[0] + iemAImpl_sha256_lower_sigma1(u32W14);
19159 uint32_t u32W17 = puDst->au32[1] + iemAImpl_sha256_lower_sigma1(u32W15);
19160 uint32_t u32W18 = puDst->au32[2] + iemAImpl_sha256_lower_sigma1(u32W16);
19161 uint32_t u32W19 = puDst->au32[3] + iemAImpl_sha256_lower_sigma1(u32W17);
19162
19163 puDst->au32[3] = u32W19;
19164 puDst->au32[2] = u32W18;
19165 puDst->au32[1] = u32W17;
19166 puDst->au32[0] = u32W16;
19167}
19168
19169/**
19170 * SHA256RNDS2
19171 */
19172DECLINLINE(uint32_t) iemAImpl_sha256_ch(uint32_t u32X, uint32_t u32Y, uint32_t u32Z)
19173{
19174 return (u32X & u32Y) ^ (~u32X & u32Z);
19175}
19176
19177DECLINLINE(uint32_t) iemAImpl_sha256_maj(uint32_t u32X, uint32_t u32Y, uint32_t u32Z)
19178{
19179 return (u32X & u32Y) ^ (u32X & u32Z) ^ (u32Y & u32Z);
19180}
19181
19182DECLINLINE(uint32_t) iemAImpl_sha256_upper_sigma0(uint32_t u32Val)
19183{
19184 return ASMRotateRightU32(u32Val, 2) ^ ASMRotateRightU32(u32Val, 13) ^ ASMRotateRightU32(u32Val, 22);
19185}
19186
19187DECLINLINE(uint32_t) iemAImpl_sha256_upper_sigma1(uint32_t u32Val)
19188{
19189 return ASMRotateRightU32(u32Val, 6) ^ ASMRotateRightU32(u32Val, 11) ^ ASMRotateRightU32(u32Val, 25);
19190}
19191
19192IEM_DECL_IMPL_DEF(void, iemAImpl_sha256rnds2_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, PCRTUINT128U puXmm0Constants))
19193{
19194 uint32_t au32A[3];
19195 uint32_t au32B[3];
19196 uint32_t au32C[3];
19197 uint32_t au32D[3];
19198 uint32_t au32E[3];
19199 uint32_t au32F[3];
19200 uint32_t au32G[3];
19201 uint32_t au32H[3];
19202 uint32_t au32WK[2];
19203
19204 au32A[0] = puSrc->au32[3];
19205 au32B[0] = puSrc->au32[2];
19206 au32C[0] = puDst->au32[3];
19207 au32D[0] = puDst->au32[2];
19208 au32E[0] = puSrc->au32[1];
19209 au32F[0] = puSrc->au32[0];
19210 au32G[0] = puDst->au32[1];
19211 au32H[0] = puDst->au32[0];
19212
19213 au32WK[0] = puXmm0Constants->au32[0];
19214 au32WK[1] = puXmm0Constants->au32[1];
19215
19216 for (uint32_t i = 0; i < 2; i++)
19217 {
19218 au32A[i + 1] = iemAImpl_sha256_ch(au32E[i], au32F[i], au32G[i])
19219 + iemAImpl_sha256_upper_sigma1(au32E[i])
19220 + au32WK[i]
19221 + au32H[i]
19222 + iemAImpl_sha256_maj(au32A[i], au32B[i], au32C[i])
19223 + iemAImpl_sha256_upper_sigma0(au32A[i]);
19224 au32B[i + 1] = au32A[i];
19225 au32C[i + 1] = au32B[i];
19226 au32D[i + 1] = au32C[i];
19227 au32E[i + 1] = iemAImpl_sha256_ch(au32E[i], au32F[i], au32G[i])
19228 + iemAImpl_sha256_upper_sigma1(au32E[i])
19229 + au32WK[i]
19230 + au32H[i]
19231 + au32D[i];
19232 au32F[i + 1] = au32E[i];
19233 au32G[i + 1] = au32F[i];
19234 au32H[i + 1] = au32G[i];
19235 }
19236
19237 puDst->au32[3] = au32A[2];
19238 puDst->au32[2] = au32B[2];
19239 puDst->au32[1] = au32E[2];
19240 puDst->au32[0] = au32F[2];
19241}
19242
19243
/**
 * ADCX
 *
 * Shared body for the ADCX and ADOX helpers: adds uSrc and the incoming value
 * of @a a_Flag to *puDst and updates only @a a_Flag in *pfEFlags with the
 * carry out of that addition.
 *
 * Expects puDst, uSrc and pfEFlags to be in scope at the expansion site.
 *
 * @param   a_Flag      The EFLAGS bit serving as carry in and carry out.
 * @param   a_Type      The unsigned operand type.
 * @param   a_Max       The maximum value representable by @a a_Type.
 */
#define ADX_EMIT(a_Flag, a_Type, a_Max) \
    do \
    { \
        bool const   fCarryIn = RT_BOOL(*pfEFlags & (a_Flag)); \
        a_Type const uSum     = *puDst + uSrc; \
        /* Carry out: either the plain sum wrapped, or adding the carry-in will wrap it. */ \
        if (   uSum < uSrc \
            || (fCarryIn && uSum == (a_Max))) \
            *pfEFlags |= (a_Flag); \
        else \
            *pfEFlags &= ~(a_Flag); \
        *puDst = fCarryIn ? uSum + 1 : uSum; \
    } \
    while (0)
19264
19265IEM_DECL_IMPL_DEF(void, iemAImpl_adcx_u32_fallback,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
19266{
19267 ADX_EMIT(X86_EFL_CF, uint32_t, UINT32_MAX);
19268}
19269
19270IEM_DECL_IMPL_DEF(void, iemAImpl_adcx_u64_fallback,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
19271{
19272 ADX_EMIT(X86_EFL_CF, uint64_t, UINT64_MAX);
19273}
19274
19275# if defined(IEM_WITHOUT_ASSEMBLY)
19276
19277IEM_DECL_IMPL_DEF(void, iemAImpl_adcx_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
19278{
19279 ADX_EMIT(X86_EFL_CF, uint32_t, UINT32_MAX);
19280}
19281
19282IEM_DECL_IMPL_DEF(void, iemAImpl_adcx_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
19283{
19284 ADX_EMIT(X86_EFL_CF, uint64_t, UINT64_MAX);
19285}
19286
19287#endif
19288
19289
19290/**
19291 * ADOX
19292 */
19293IEM_DECL_IMPL_DEF(void, iemAImpl_adox_u32_fallback,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
19294{
19295 ADX_EMIT(X86_EFL_OF, uint32_t, UINT32_MAX);
19296}
19297
19298IEM_DECL_IMPL_DEF(void, iemAImpl_adox_u64_fallback,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
19299{
19300 ADX_EMIT(X86_EFL_OF, uint64_t, UINT64_MAX);
19301}
19302
19303# if defined(IEM_WITHOUT_ASSEMBLY)
19304
19305IEM_DECL_IMPL_DEF(void, iemAImpl_adox_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
19306{
19307 ADX_EMIT(X86_EFL_OF, uint32_t, UINT32_MAX);
19308}
19309
19310IEM_DECL_IMPL_DEF(void, iemAImpl_adox_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
19311{
19312 ADX_EMIT(X86_EFL_OF, uint64_t, UINT64_MAX);
19313}
19314
19315# endif
19316
19317
19318/**
19319 * MPSADBW
19320 */
19321IEM_DECL_IMPL_DEF(void, iemAImpl_mpsadbw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
19322{
19323 uint8_t idxSrc2 = (bEvil & 0x3) * sizeof(uint32_t);
19324 uint8_t idxSrc1 = ((bEvil >> 2) & 0x1) * sizeof(uint32_t);
19325 int16_t ai16Src1[11];
19326 int16_t ai16Src2[4];
19327
19328 for (uint32_t i = 0; i < RT_ELEMENTS(ai16Src1); i++)
19329 ai16Src1[i] = puDst->au8[idxSrc1 + i];
19330
19331 for (uint32_t i = 0; i < RT_ELEMENTS(ai16Src2); i++)
19332 ai16Src2[i] = puSrc->au8[idxSrc2 + i];
19333
19334 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au16); i++)
19335 puDst->au16[i] = RT_ABS(ai16Src1[i] - ai16Src2[0])
19336 + RT_ABS(ai16Src1[i + 1] - ai16Src2[1])
19337 + RT_ABS(ai16Src1[i + 2] - ai16Src2[2])
19338 + RT_ABS(ai16Src1[i + 3] - ai16Src2[3]);
19339}
19340
19341
19342IEM_DECL_IMPL_DEF(void, iemAImpl_vmpsadbw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
19343{
19344 uint8_t idxSrc2 = (bEvil & 0x3) * sizeof(uint32_t);
19345 uint8_t idxSrc1 = ((bEvil >> 2) & 0x1) * sizeof(uint32_t);
19346 int16_t ai16Src1[11];
19347 int16_t ai16Src2[4];
19348
19349 for (uint32_t i = 0; i < RT_ELEMENTS(ai16Src1); i++)
19350 ai16Src1[i] = puSrc1->au8[idxSrc1 + i];
19351
19352 for (uint32_t i = 0; i < RT_ELEMENTS(ai16Src2); i++)
19353 ai16Src2[i] = puSrc2->au8[idxSrc2 + i];
19354
19355 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au16); i++)
19356 puDst->au16[i] = RT_ABS(ai16Src1[i] - ai16Src2[0])
19357 + RT_ABS(ai16Src1[i + 1] - ai16Src2[1])
19358 + RT_ABS(ai16Src1[i + 2] - ai16Src2[2])
19359 + RT_ABS(ai16Src1[i + 3] - ai16Src2[3]);
19360}
19361
19362
19363IEM_DECL_IMPL_DEF(void, iemAImpl_vmpsadbw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bEvil))
19364{
19365 RTUINT256U const uSrc1 = *puSrc1; /* Might overlap with destination. */
19366 RTUINT256U const uSrc2 = *puSrc2;
19367 ASMCompilerBarrier();
19368 iemAImpl_vmpsadbw_u128_fallback(&puDst->au128[0], &uSrc1.au128[0], &uSrc2.au128[0], bEvil);
19369 iemAImpl_vmpsadbw_u128_fallback(&puDst->au128[1], &uSrc1.au128[1], &uSrc2.au128[1], bEvil >> 3);
19370}
19371
19372
19373/**
19374 * VPERM2I128
19375 */
19376IEM_DECL_IMPL_DEF(void, iemAImpl_vperm2i128_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bImm))
19377{
19378 if (bImm & RT_BIT(3))
19379 {
19380 puDst->au64[0] = 0;
19381 puDst->au64[1] = 0;
19382 }
19383 else
19384 {
19385 switch (bImm & 0x3)
19386 {
19387 case 0:
19388 puDst->au64[0] = puSrc1->au64[0];
19389 puDst->au64[1] = puSrc1->au64[1];
19390 break;
19391 case 1:
19392 puDst->au64[0] = puSrc1->au64[2];
19393 puDst->au64[1] = puSrc1->au64[3];
19394 break;
19395 case 2:
19396 puDst->au64[0] = puSrc2->au64[0];
19397 puDst->au64[1] = puSrc2->au64[1];
19398 break;
19399 case 3:
19400 puDst->au64[0] = puSrc2->au64[2];
19401 puDst->au64[1] = puSrc2->au64[3];
19402 break;
19403 }
19404 }
19405
19406 if (bImm & RT_BIT(7))
19407 {
19408 puDst->au64[2] = 0;
19409 puDst->au64[3] = 0;
19410 }
19411 else
19412 {
19413 switch ((bImm >> 4) & 0x3)
19414 {
19415 case 0:
19416 puDst->au64[2] = puSrc1->au64[0];
19417 puDst->au64[3] = puSrc1->au64[1];
19418 break;
19419 case 1:
19420 puDst->au64[2] = puSrc1->au64[2];
19421 puDst->au64[3] = puSrc1->au64[3];
19422 break;
19423 case 2:
19424 puDst->au64[2] = puSrc2->au64[0];
19425 puDst->au64[3] = puSrc2->au64[1];
19426 break;
19427 case 3:
19428 puDst->au64[2] = puSrc2->au64[2];
19429 puDst->au64[3] = puSrc2->au64[3];
19430 break;
19431 }
19432 }
19433}
19434
19435
19436/**
19437 * VPERM2F128
19438 */
19439IEM_DECL_IMPL_DEF(void, iemAImpl_vperm2f128_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bImm))
19440{
19441 iemAImpl_vperm2i128_u256_fallback(puDst, puSrc1, puSrc2, bImm);
19442}
19443
19444
19445/**
19446 * DPPS
19447 */
19448IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_dpps_u128_fallback,(uint32_t uMxCsrIn, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bImm))
19449{
19450 RT_NOREF(puDst, pSrc, bImm);
19451 AssertReleaseFailed();
19452 return uMxCsrIn;
19453}
19454
19455
19456/**
19457 * DPPD
19458 */
19459IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_dppd_u128_fallback,(uint32_t uMxCsrIn, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bImm))
19460{
19461 RT_NOREF(puDst, pSrc, bImm);
19462 AssertReleaseFailed();
19463 return uMxCsrIn;
19464}
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette