VirtualBox

source: vbox/trunk/src/VBox/VMM/VMMAll/IEMAllAImplC.cpp@96014

Last change on this file since 96014 was 96010, checked in by vboxsync, 2 years ago

VMM/IEM: Implement [v]pmaxs{b,w,d} instructions, bugref:9898

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 442.5 KB
1/* $Id: IEMAllAImplC.cpp 96010 2022-08-03 20:15:46Z vboxsync $ */
2/** @file
3 * IEM - Instruction Implementation in Assembly, portable C variant.
4 */
5
6/*
7 * Copyright (C) 2011-2022 Oracle Corporation
8 *
9 * This file is part of VirtualBox Open Source Edition (OSE), as
10 * available from http://www.virtualbox.org. This file is free software;
11 * you can redistribute it and/or modify it under the terms of the GNU
12 * General Public License (GPL) as published by the Free Software
13 * Foundation, in version 2 as it comes in the "COPYING" file of the
14 * VirtualBox OSE distribution. VirtualBox OSE is distributed in the
15 * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
16 */
17
18
19/*********************************************************************************************************************************
20* Header Files *
21*********************************************************************************************************************************/
22#include "IEMInternal.h"
23#include <VBox/vmm/vmcc.h>
24#include <iprt/errcore.h>
25#include <iprt/x86.h>
26#include <iprt/uint128.h>
27#include <iprt/uint256.h>
28#include <iprt/crc.h>
29
30RT_C_DECLS_BEGIN
31#include <softfloat.h>
32RT_C_DECLS_END
33
34
35/*********************************************************************************************************************************
36* Defined Constants And Macros *
37*********************************************************************************************************************************/
38/** @def IEM_WITHOUT_ASSEMBLY
39 * Enables all the code in this file.
40 */
41#if !defined(IEM_WITHOUT_ASSEMBLY)
42# if defined(RT_ARCH_ARM32) || defined(RT_ARCH_ARM64) || defined(DOXYGEN_RUNNING)
43# define IEM_WITHOUT_ASSEMBLY
44# endif
45#endif
46/* IEM_WITH_ASSEMBLY trumps IEM_WITHOUT_ASSEMBLY for tstIEMAImplAsm purposes. */
47#ifdef IEM_WITH_ASSEMBLY
48# undef IEM_WITHOUT_ASSEMBLY
49#endif
50
51/**
52 * Calculates the signed flag value given a result and its bit width.
53 *
54 * The signed flag (SF) is a duplication of the most significant bit in the
55 * result.
56 *
57 * @returns X86_EFL_SF or 0.
58 * @param a_uResult Unsigned result value.
59 * @param a_cBitsWidth The width of the result (8, 16, 32, 64).
60 */
61#define X86_EFL_CALC_SF(a_uResult, a_cBitsWidth) \
62 ( (uint32_t)((a_uResult) >> ((a_cBitsWidth) - X86_EFL_SF_BIT - 1)) & X86_EFL_SF )
63
64/**
65 * Calculates the zero flag value given a result.
66 *
67 * The zero flag (ZF) indicates whether the result is zero or not.
68 *
69 * @returns X86_EFL_ZF or 0.
70 * @param a_uResult Unsigned result value.
71 */
72#define X86_EFL_CALC_ZF(a_uResult) \
73 ( (uint32_t)((a_uResult) == 0) << X86_EFL_ZF_BIT )
74
75/**
76 * Extracts the OF flag from an OF calculation result.
77 *
78 * These are typically used by concatenating with a bit count. The problem is
79 * that 8-bit values need shifting in the opposite direction from the others.
80 */
81#define X86_EFL_GET_OF_8(a_uValue) (((uint32_t)(a_uValue) << (X86_EFL_OF_BIT - 8 + 1)) & X86_EFL_OF)
82#define X86_EFL_GET_OF_16(a_uValue) ((uint32_t)((a_uValue) >> (16 - X86_EFL_OF_BIT - 1)) & X86_EFL_OF)
83#define X86_EFL_GET_OF_32(a_uValue) ((uint32_t)((a_uValue) >> (32 - X86_EFL_OF_BIT - 1)) & X86_EFL_OF)
84#define X86_EFL_GET_OF_64(a_uValue) ((uint32_t)((a_uValue) >> (64 - X86_EFL_OF_BIT - 1)) & X86_EFL_OF)
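/*
 * Illustrative example (not part of the original file): how the helpers above
 * combine for an 8-bit ADD of 0x7f + 0x01.  The sum is 0x80, so SF is set, ZF
 * is clear, and the signed-overflow input ~(uDst ^ uSrc) & 0x80 & (uResult ^ uDst)
 * is 0x80, which X86_EFL_GET_OF_8 turns into X86_EFL_OF.  A minimal sketch,
 * assuming the X86_EFL_* definitions from iprt/x86.h included above:
 * @code
 *  uint8_t  const uDst    = 0x7f;
 *  uint8_t  const uSrc    = 0x01;
 *  uint8_t  const uResult = (uint8_t)(uDst + uSrc);                // 0x80
 *  uint32_t fEfl = 0;
 *  fEfl |= X86_EFL_CALC_ZF(uResult);                               // 0 - the result is non-zero
 *  fEfl |= X86_EFL_CALC_SF(uResult, 8);                            // X86_EFL_SF - bit 7 is set
 *  fEfl |= X86_EFL_GET_OF_8((uint8_t)~(uDst ^ uSrc) & 0x80 & (uResult ^ uDst)); // X86_EFL_OF
 * @endcode
 */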
85
86/**
87 * Updates the status bits (CF, PF, AF, ZF, SF, and OF) after an arithmetic op.
88 *
89 * @returns Status bits.
90 * @param a_pfEFlags Pointer to the 32-bit EFLAGS value to update.
91 * @param a_uResult Unsigned result value.
92 * @param a_uSrc The source value (for AF calc).
93 * @param a_uDst The original destination value (for AF calc).
94 * @param a_cBitsWidth The width of the result (8, 16, 32, 64).
95 * @param a_CfExpr Bool expression for the carry flag (CF).
96 * @param a_uSrcOf The a_uSrc value to use for overflow calculation.
97 */
98#define IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(a_pfEFlags, a_uResult, a_uDst, a_uSrc, a_cBitsWidth, a_CfExpr, a_uSrcOf) \
99 do { \
100 uint32_t fEflTmp = *(a_pfEFlags); \
101 fEflTmp &= ~X86_EFL_STATUS_BITS; \
102 fEflTmp |= (a_CfExpr) << X86_EFL_CF_BIT; \
103 fEflTmp |= g_afParity[(a_uResult) & 0xff]; \
104 fEflTmp |= ((uint32_t)(a_uResult) ^ (uint32_t)(a_uSrc) ^ (uint32_t)(a_uDst)) & X86_EFL_AF; \
105 fEflTmp |= X86_EFL_CALC_ZF(a_uResult); \
106 fEflTmp |= X86_EFL_CALC_SF(a_uResult, a_cBitsWidth); \
107 \
108 /* Overflow during ADDition happens when both inputs have the same signed \
109 bit value and the result has a different sign bit value. \
110 \
111 Since subtraction can be rewritten as addition: 2 - 1 == 2 + -1, it \
112 follows that for SUBtraction the signed bit value must differ between \
113 the two inputs and the result's sign bit must differ from the first input's. \
114 Note! Must xor with sign bit to convert, not do (0 - a_uSrc). \
115 \
116 See also: http://teaching.idallen.com/dat2343/10f/notes/040_overflow.txt */ \
117 fEflTmp |= X86_EFL_GET_OF_ ## a_cBitsWidth( ( ((uint ## a_cBitsWidth ## _t)~((a_uDst) ^ (a_uSrcOf))) \
118 & RT_BIT_64(a_cBitsWidth - 1)) \
119 & ((a_uResult) ^ (a_uDst)) ); \
120 *(a_pfEFlags) = fEflTmp; \
121 } while (0)
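/*
 * Worked restatement of the overflow rule above (illustrative only, not used
 * by the code below): for an ADD the two inputs must share a sign bit that
 * the result does not have.
 * @code
 *  static bool ExampleAddOverflows8(uint8_t uDst, uint8_t uSrc)
 *  {
 *      uint8_t const uResult = (uint8_t)(uDst + uSrc);
 *      return (~(uDst ^ uSrc) & (uResult ^ uDst) & 0x80) != 0;
 *  }
 *  // ExampleAddOverflows8(0x7f, 0x01) -> true  (0x7f + 1 = 0x80, the sign flips)
 *  // ExampleAddOverflows8(0xff, 0x01) -> false (-1 + 1 = 0, a carry but no overflow)
 * @endcode
 * For SUB the same test is reused after XORing uSrc with the sign bit, which
 * is what the a_uSrcOf parameter is for.
 */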
122
123/**
124 * Updates the status bits (CF, PF, AF, ZF, SF, and OF) after a logical op.
125 *
126 * CF and OF are defined to be 0 by logical operations. AF on the other hand is
127 * undefined. We do not set AF, as that seems to make the most sense (which
128 * probably makes it the most wrong in real life).
129 *
130 * @returns Status bits.
131 * @param a_pfEFlags Pointer to the 32-bit EFLAGS value to update.
132 * @param a_uResult Unsigned result value.
133 * @param a_cBitsWidth The width of the result (8, 16, 32, 64).
134 * @param a_fExtra Additional bits to set.
135 */
136#define IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(a_pfEFlags, a_uResult, a_cBitsWidth, a_fExtra) \
137 do { \
138 uint32_t fEflTmp = *(a_pfEFlags); \
139 fEflTmp &= ~X86_EFL_STATUS_BITS; \
140 fEflTmp |= g_afParity[(a_uResult) & 0xff]; \
141 fEflTmp |= X86_EFL_CALC_ZF(a_uResult); \
142 fEflTmp |= X86_EFL_CALC_SF(a_uResult, a_cBitsWidth); \
143 fEflTmp |= (a_fExtra); \
144 *(a_pfEFlags) = fEflTmp; \
145 } while (0)
146
147
148/*********************************************************************************************************************************
149* Global Variables *
150*********************************************************************************************************************************/
151/**
152 * Parity calculation table.
153 *
154 * This is also used by iemAllAImpl.asm.
155 *
156 * The generator code:
157 * @code
158 * #include <stdio.h>
159 *
160 * int main()
161 * {
162 * unsigned b;
163 * for (b = 0; b < 256; b++)
164 * {
165 * int cOnes = ( b & 1)
166 * + ((b >> 1) & 1)
167 * + ((b >> 2) & 1)
168 * + ((b >> 3) & 1)
169 * + ((b >> 4) & 1)
170 * + ((b >> 5) & 1)
171 * + ((b >> 6) & 1)
172 * + ((b >> 7) & 1);
173 * printf(" /" "* %#04x = %u%u%u%u%u%u%u%ub *" "/ %s,\n",
174 * b,
175 * (b >> 7) & 1,
176 * (b >> 6) & 1,
177 * (b >> 5) & 1,
178 * (b >> 4) & 1,
179 * (b >> 3) & 1,
180 * (b >> 2) & 1,
181 * (b >> 1) & 1,
182 * b & 1,
183 * cOnes & 1 ? "0" : "X86_EFL_PF");
184 * }
185 * return 0;
186 * }
187 * @endcode
188 */
189uint8_t const g_afParity[256] =
190{
191 /* 0x00 = 00000000b */ X86_EFL_PF,
192 /* 0x01 = 00000001b */ 0,
193 /* 0x02 = 00000010b */ 0,
194 /* 0x03 = 00000011b */ X86_EFL_PF,
195 /* 0x04 = 00000100b */ 0,
196 /* 0x05 = 00000101b */ X86_EFL_PF,
197 /* 0x06 = 00000110b */ X86_EFL_PF,
198 /* 0x07 = 00000111b */ 0,
199 /* 0x08 = 00001000b */ 0,
200 /* 0x09 = 00001001b */ X86_EFL_PF,
201 /* 0x0a = 00001010b */ X86_EFL_PF,
202 /* 0x0b = 00001011b */ 0,
203 /* 0x0c = 00001100b */ X86_EFL_PF,
204 /* 0x0d = 00001101b */ 0,
205 /* 0x0e = 00001110b */ 0,
206 /* 0x0f = 00001111b */ X86_EFL_PF,
207 /* 0x10 = 00010000b */ 0,
208 /* 0x11 = 00010001b */ X86_EFL_PF,
209 /* 0x12 = 00010010b */ X86_EFL_PF,
210 /* 0x13 = 00010011b */ 0,
211 /* 0x14 = 00010100b */ X86_EFL_PF,
212 /* 0x15 = 00010101b */ 0,
213 /* 0x16 = 00010110b */ 0,
214 /* 0x17 = 00010111b */ X86_EFL_PF,
215 /* 0x18 = 00011000b */ X86_EFL_PF,
216 /* 0x19 = 00011001b */ 0,
217 /* 0x1a = 00011010b */ 0,
218 /* 0x1b = 00011011b */ X86_EFL_PF,
219 /* 0x1c = 00011100b */ 0,
220 /* 0x1d = 00011101b */ X86_EFL_PF,
221 /* 0x1e = 00011110b */ X86_EFL_PF,
222 /* 0x1f = 00011111b */ 0,
223 /* 0x20 = 00100000b */ 0,
224 /* 0x21 = 00100001b */ X86_EFL_PF,
225 /* 0x22 = 00100010b */ X86_EFL_PF,
226 /* 0x23 = 00100011b */ 0,
227 /* 0x24 = 00100100b */ X86_EFL_PF,
228 /* 0x25 = 00100101b */ 0,
229 /* 0x26 = 00100110b */ 0,
230 /* 0x27 = 00100111b */ X86_EFL_PF,
231 /* 0x28 = 00101000b */ X86_EFL_PF,
232 /* 0x29 = 00101001b */ 0,
233 /* 0x2a = 00101010b */ 0,
234 /* 0x2b = 00101011b */ X86_EFL_PF,
235 /* 0x2c = 00101100b */ 0,
236 /* 0x2d = 00101101b */ X86_EFL_PF,
237 /* 0x2e = 00101110b */ X86_EFL_PF,
238 /* 0x2f = 00101111b */ 0,
239 /* 0x30 = 00110000b */ X86_EFL_PF,
240 /* 0x31 = 00110001b */ 0,
241 /* 0x32 = 00110010b */ 0,
242 /* 0x33 = 00110011b */ X86_EFL_PF,
243 /* 0x34 = 00110100b */ 0,
244 /* 0x35 = 00110101b */ X86_EFL_PF,
245 /* 0x36 = 00110110b */ X86_EFL_PF,
246 /* 0x37 = 00110111b */ 0,
247 /* 0x38 = 00111000b */ 0,
248 /* 0x39 = 00111001b */ X86_EFL_PF,
249 /* 0x3a = 00111010b */ X86_EFL_PF,
250 /* 0x3b = 00111011b */ 0,
251 /* 0x3c = 00111100b */ X86_EFL_PF,
252 /* 0x3d = 00111101b */ 0,
253 /* 0x3e = 00111110b */ 0,
254 /* 0x3f = 00111111b */ X86_EFL_PF,
255 /* 0x40 = 01000000b */ 0,
256 /* 0x41 = 01000001b */ X86_EFL_PF,
257 /* 0x42 = 01000010b */ X86_EFL_PF,
258 /* 0x43 = 01000011b */ 0,
259 /* 0x44 = 01000100b */ X86_EFL_PF,
260 /* 0x45 = 01000101b */ 0,
261 /* 0x46 = 01000110b */ 0,
262 /* 0x47 = 01000111b */ X86_EFL_PF,
263 /* 0x48 = 01001000b */ X86_EFL_PF,
264 /* 0x49 = 01001001b */ 0,
265 /* 0x4a = 01001010b */ 0,
266 /* 0x4b = 01001011b */ X86_EFL_PF,
267 /* 0x4c = 01001100b */ 0,
268 /* 0x4d = 01001101b */ X86_EFL_PF,
269 /* 0x4e = 01001110b */ X86_EFL_PF,
270 /* 0x4f = 01001111b */ 0,
271 /* 0x50 = 01010000b */ X86_EFL_PF,
272 /* 0x51 = 01010001b */ 0,
273 /* 0x52 = 01010010b */ 0,
274 /* 0x53 = 01010011b */ X86_EFL_PF,
275 /* 0x54 = 01010100b */ 0,
276 /* 0x55 = 01010101b */ X86_EFL_PF,
277 /* 0x56 = 01010110b */ X86_EFL_PF,
278 /* 0x57 = 01010111b */ 0,
279 /* 0x58 = 01011000b */ 0,
280 /* 0x59 = 01011001b */ X86_EFL_PF,
281 /* 0x5a = 01011010b */ X86_EFL_PF,
282 /* 0x5b = 01011011b */ 0,
283 /* 0x5c = 01011100b */ X86_EFL_PF,
284 /* 0x5d = 01011101b */ 0,
285 /* 0x5e = 01011110b */ 0,
286 /* 0x5f = 01011111b */ X86_EFL_PF,
287 /* 0x60 = 01100000b */ X86_EFL_PF,
288 /* 0x61 = 01100001b */ 0,
289 /* 0x62 = 01100010b */ 0,
290 /* 0x63 = 01100011b */ X86_EFL_PF,
291 /* 0x64 = 01100100b */ 0,
292 /* 0x65 = 01100101b */ X86_EFL_PF,
293 /* 0x66 = 01100110b */ X86_EFL_PF,
294 /* 0x67 = 01100111b */ 0,
295 /* 0x68 = 01101000b */ 0,
296 /* 0x69 = 01101001b */ X86_EFL_PF,
297 /* 0x6a = 01101010b */ X86_EFL_PF,
298 /* 0x6b = 01101011b */ 0,
299 /* 0x6c = 01101100b */ X86_EFL_PF,
300 /* 0x6d = 01101101b */ 0,
301 /* 0x6e = 01101110b */ 0,
302 /* 0x6f = 01101111b */ X86_EFL_PF,
303 /* 0x70 = 01110000b */ 0,
304 /* 0x71 = 01110001b */ X86_EFL_PF,
305 /* 0x72 = 01110010b */ X86_EFL_PF,
306 /* 0x73 = 01110011b */ 0,
307 /* 0x74 = 01110100b */ X86_EFL_PF,
308 /* 0x75 = 01110101b */ 0,
309 /* 0x76 = 01110110b */ 0,
310 /* 0x77 = 01110111b */ X86_EFL_PF,
311 /* 0x78 = 01111000b */ X86_EFL_PF,
312 /* 0x79 = 01111001b */ 0,
313 /* 0x7a = 01111010b */ 0,
314 /* 0x7b = 01111011b */ X86_EFL_PF,
315 /* 0x7c = 01111100b */ 0,
316 /* 0x7d = 01111101b */ X86_EFL_PF,
317 /* 0x7e = 01111110b */ X86_EFL_PF,
318 /* 0x7f = 01111111b */ 0,
319 /* 0x80 = 10000000b */ 0,
320 /* 0x81 = 10000001b */ X86_EFL_PF,
321 /* 0x82 = 10000010b */ X86_EFL_PF,
322 /* 0x83 = 10000011b */ 0,
323 /* 0x84 = 10000100b */ X86_EFL_PF,
324 /* 0x85 = 10000101b */ 0,
325 /* 0x86 = 10000110b */ 0,
326 /* 0x87 = 10000111b */ X86_EFL_PF,
327 /* 0x88 = 10001000b */ X86_EFL_PF,
328 /* 0x89 = 10001001b */ 0,
329 /* 0x8a = 10001010b */ 0,
330 /* 0x8b = 10001011b */ X86_EFL_PF,
331 /* 0x8c = 10001100b */ 0,
332 /* 0x8d = 10001101b */ X86_EFL_PF,
333 /* 0x8e = 10001110b */ X86_EFL_PF,
334 /* 0x8f = 10001111b */ 0,
335 /* 0x90 = 10010000b */ X86_EFL_PF,
336 /* 0x91 = 10010001b */ 0,
337 /* 0x92 = 10010010b */ 0,
338 /* 0x93 = 10010011b */ X86_EFL_PF,
339 /* 0x94 = 10010100b */ 0,
340 /* 0x95 = 10010101b */ X86_EFL_PF,
341 /* 0x96 = 10010110b */ X86_EFL_PF,
342 /* 0x97 = 10010111b */ 0,
343 /* 0x98 = 10011000b */ 0,
344 /* 0x99 = 10011001b */ X86_EFL_PF,
345 /* 0x9a = 10011010b */ X86_EFL_PF,
346 /* 0x9b = 10011011b */ 0,
347 /* 0x9c = 10011100b */ X86_EFL_PF,
348 /* 0x9d = 10011101b */ 0,
349 /* 0x9e = 10011110b */ 0,
350 /* 0x9f = 10011111b */ X86_EFL_PF,
351 /* 0xa0 = 10100000b */ X86_EFL_PF,
352 /* 0xa1 = 10100001b */ 0,
353 /* 0xa2 = 10100010b */ 0,
354 /* 0xa3 = 10100011b */ X86_EFL_PF,
355 /* 0xa4 = 10100100b */ 0,
356 /* 0xa5 = 10100101b */ X86_EFL_PF,
357 /* 0xa6 = 10100110b */ X86_EFL_PF,
358 /* 0xa7 = 10100111b */ 0,
359 /* 0xa8 = 10101000b */ 0,
360 /* 0xa9 = 10101001b */ X86_EFL_PF,
361 /* 0xaa = 10101010b */ X86_EFL_PF,
362 /* 0xab = 10101011b */ 0,
363 /* 0xac = 10101100b */ X86_EFL_PF,
364 /* 0xad = 10101101b */ 0,
365 /* 0xae = 10101110b */ 0,
366 /* 0xaf = 10101111b */ X86_EFL_PF,
367 /* 0xb0 = 10110000b */ 0,
368 /* 0xb1 = 10110001b */ X86_EFL_PF,
369 /* 0xb2 = 10110010b */ X86_EFL_PF,
370 /* 0xb3 = 10110011b */ 0,
371 /* 0xb4 = 10110100b */ X86_EFL_PF,
372 /* 0xb5 = 10110101b */ 0,
373 /* 0xb6 = 10110110b */ 0,
374 /* 0xb7 = 10110111b */ X86_EFL_PF,
375 /* 0xb8 = 10111000b */ X86_EFL_PF,
376 /* 0xb9 = 10111001b */ 0,
377 /* 0xba = 10111010b */ 0,
378 /* 0xbb = 10111011b */ X86_EFL_PF,
379 /* 0xbc = 10111100b */ 0,
380 /* 0xbd = 10111101b */ X86_EFL_PF,
381 /* 0xbe = 10111110b */ X86_EFL_PF,
382 /* 0xbf = 10111111b */ 0,
383 /* 0xc0 = 11000000b */ X86_EFL_PF,
384 /* 0xc1 = 11000001b */ 0,
385 /* 0xc2 = 11000010b */ 0,
386 /* 0xc3 = 11000011b */ X86_EFL_PF,
387 /* 0xc4 = 11000100b */ 0,
388 /* 0xc5 = 11000101b */ X86_EFL_PF,
389 /* 0xc6 = 11000110b */ X86_EFL_PF,
390 /* 0xc7 = 11000111b */ 0,
391 /* 0xc8 = 11001000b */ 0,
392 /* 0xc9 = 11001001b */ X86_EFL_PF,
393 /* 0xca = 11001010b */ X86_EFL_PF,
394 /* 0xcb = 11001011b */ 0,
395 /* 0xcc = 11001100b */ X86_EFL_PF,
396 /* 0xcd = 11001101b */ 0,
397 /* 0xce = 11001110b */ 0,
398 /* 0xcf = 11001111b */ X86_EFL_PF,
399 /* 0xd0 = 11010000b */ 0,
400 /* 0xd1 = 11010001b */ X86_EFL_PF,
401 /* 0xd2 = 11010010b */ X86_EFL_PF,
402 /* 0xd3 = 11010011b */ 0,
403 /* 0xd4 = 11010100b */ X86_EFL_PF,
404 /* 0xd5 = 11010101b */ 0,
405 /* 0xd6 = 11010110b */ 0,
406 /* 0xd7 = 11010111b */ X86_EFL_PF,
407 /* 0xd8 = 11011000b */ X86_EFL_PF,
408 /* 0xd9 = 11011001b */ 0,
409 /* 0xda = 11011010b */ 0,
410 /* 0xdb = 11011011b */ X86_EFL_PF,
411 /* 0xdc = 11011100b */ 0,
412 /* 0xdd = 11011101b */ X86_EFL_PF,
413 /* 0xde = 11011110b */ X86_EFL_PF,
414 /* 0xdf = 11011111b */ 0,
415 /* 0xe0 = 11100000b */ 0,
416 /* 0xe1 = 11100001b */ X86_EFL_PF,
417 /* 0xe2 = 11100010b */ X86_EFL_PF,
418 /* 0xe3 = 11100011b */ 0,
419 /* 0xe4 = 11100100b */ X86_EFL_PF,
420 /* 0xe5 = 11100101b */ 0,
421 /* 0xe6 = 11100110b */ 0,
422 /* 0xe7 = 11100111b */ X86_EFL_PF,
423 /* 0xe8 = 11101000b */ X86_EFL_PF,
424 /* 0xe9 = 11101001b */ 0,
425 /* 0xea = 11101010b */ 0,
426 /* 0xeb = 11101011b */ X86_EFL_PF,
427 /* 0xec = 11101100b */ 0,
428 /* 0xed = 11101101b */ X86_EFL_PF,
429 /* 0xee = 11101110b */ X86_EFL_PF,
430 /* 0xef = 11101111b */ 0,
431 /* 0xf0 = 11110000b */ X86_EFL_PF,
432 /* 0xf1 = 11110001b */ 0,
433 /* 0xf2 = 11110010b */ 0,
434 /* 0xf3 = 11110011b */ X86_EFL_PF,
435 /* 0xf4 = 11110100b */ 0,
436 /* 0xf5 = 11110101b */ X86_EFL_PF,
437 /* 0xf6 = 11110110b */ X86_EFL_PF,
438 /* 0xf7 = 11110111b */ 0,
439 /* 0xf8 = 11111000b */ 0,
440 /* 0xf9 = 11111001b */ X86_EFL_PF,
441 /* 0xfa = 11111010b */ X86_EFL_PF,
442 /* 0xfb = 11111011b */ 0,
443 /* 0xfc = 11111100b */ X86_EFL_PF,
444 /* 0xfd = 11111101b */ 0,
445 /* 0xfe = 11111110b */ 0,
446 /* 0xff = 11111111b */ X86_EFL_PF,
447};
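/*
 * The table above simply encodes even parity of the low result byte.  An
 * equivalent computation without the lookup (illustrative sketch only; the
 * table is kept because iemAllAImpl.asm indexes it directly):
 * @code
 *  static uint32_t ExampleCalcPf(uint8_t bResult)
 *  {
 *      unsigned cOnes = 0;
 *      for (unsigned iBit = 0; iBit < 8; iBit++)
 *          cOnes += (bResult >> iBit) & 1;
 *      return (cOnes & 1) ? 0 : X86_EFL_PF;    // PF is set for an even number of 1s
 *  }
 * @endcode
 */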
448
449/* for clang: */
450extern const RTFLOAT80U g_ar80Zero[];
451extern const RTFLOAT80U g_ar80One[];
452extern const RTFLOAT80U g_r80Indefinite;
453extern const RTFLOAT80U g_ar80Infinity[];
454extern const RTFLOAT128U g_r128Ln2;
455extern const RTUINT128U g_u128Ln2Mantissa;
456extern const RTUINT128U g_u128Ln2MantissaIntel;
457extern const RTFLOAT128U g_ar128F2xm1HornerConsts[];
458
459/** Zero values (indexed by fSign). */
460RTFLOAT80U const g_ar80Zero[] = { RTFLOAT80U_INIT_ZERO(0), RTFLOAT80U_INIT_ZERO(1) };
461
462/** One values (indexed by fSign). */
463RTFLOAT80U const g_ar80One[] =
464{ RTFLOAT80U_INIT(0, RT_BIT_64(63), RTFLOAT80U_EXP_BIAS), RTFLOAT80U_INIT(1, RT_BIT_64(63), RTFLOAT80U_EXP_BIAS) };
465
466/** Indefinite (negative). */
467RTFLOAT80U const g_r80Indefinite = RTFLOAT80U_INIT_INDEFINITE(1);
468
469/** Infinities (indexed by fSign). */
470RTFLOAT80U const g_ar80Infinity[] = { RTFLOAT80U_INIT_INF(0), RTFLOAT80U_INIT_INF(1) };
471
472#if 0
473/** 128-bit floating point constant: 2.0 */
474const RTFLOAT128U g_r128Two = RTFLOAT128U_INIT_C(0, 0, 0, RTFLOAT128U_EXP_BIAS + 1);
475#endif
476
477
478/* The next section is generated by tools/IEMGenFpuConstants: */
479
480/** The ln2 constant as 128-bit floating point value.
481 * base-10: 6.93147180559945309417232121458176575e-1
482 * base-16: b.17217f7d1cf79abc9e3b39803f30@-1
483 * base-2 : 1.0110001011100100001011111110111110100011100111101111001101010111100100111100011101100111001100000000011111100110e-1 */
484//const RTFLOAT128U g_r128Ln2 = RTFLOAT128U_INIT_C(0, 0x62e42fefa39e, 0xf35793c7673007e6, 0x3ffe);
485const RTFLOAT128U g_r128Ln2 = RTFLOAT128U_INIT_C(0, 0x62e42fefa39e, 0xf357900000000000, 0x3ffe);
486/** High precision ln2 value.
487 * base-10: 6.931471805599453094172321214581765680747e-1
488 * base-16: b.17217f7d1cf79abc9e3b39803f2f6af0@-1
489 * base-2 : 1.0110001011100100001011111110111110100011100111101111001101010111100100111100011101100111001100000000011111100101111011010101111e-1 */
490const RTUINT128U g_u128Ln2Mantissa = RTUINT128_INIT_C(0xb17217f7d1cf79ab, 0xc9e3b39803f2f6af);
491/** High precision ln2 value, compatible with f2xm1 results on intel 10980XE.
492 * base-10: 6.931471805599453094151379470289064954613e-1
493 * base-16: b.17217f7d1cf79abc0000000000000000@-1
494 * base-2 : 1.0110001011100100001011111110111110100011100111101111001101010111100000000000000000000000000000000000000000000000000000000000000e-1 */
495const RTUINT128U g_u128Ln2MantissaIntel = RTUINT128_INIT_C(0xb17217f7d1cf79ab, 0xc000000000000000);
496
497/** Horner constants for f2xm1 */
498const RTFLOAT128U g_ar128F2xm1HornerConsts[] =
499{
500 /* a0
501 * base-10: 1.00000000000000000000000000000000000e0
502 * base-16: 1.0000000000000000000000000000@0
503 * base-2 : 1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000e0 */
504 RTFLOAT128U_INIT_C(0, 0x000000000000, 0x0000000000000000, 0x3fff),
505 /* a1
506 * base-10: 5.00000000000000000000000000000000000e-1
507 * base-16: 8.0000000000000000000000000000@-1
508 * base-2 : 1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000e-1 */
509 RTFLOAT128U_INIT_C(0, 0x000000000000, 0x0000000000000000, 0x3ffe),
510 /* a2
511 * base-10: 1.66666666666666666666666666666666658e-1
512 * base-16: 2.aaaaaaaaaaaaaaaaaaaaaaaaaaaa@-1
513 * base-2 : 1.0101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101e-3 */
514 RTFLOAT128U_INIT_C(0, 0x555555555555, 0x5555555555555555, 0x3ffc),
515 /* a3
516 * base-10: 4.16666666666666666666666666666666646e-2
517 * base-16: a.aaaaaaaaaaaaaaaaaaaaaaaaaaa8@-2
518 * base-2 : 1.0101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101e-5 */
519 RTFLOAT128U_INIT_C(0, 0x555555555555, 0x5555555555555555, 0x3ffa),
520 /* a4
521 * base-10: 8.33333333333333333333333333333333323e-3
522 * base-16: 2.2222222222222222222222222222@-2
523 * base-2 : 1.0001000100010001000100010001000100010001000100010001000100010001000100010001000100010001000100010001000100010001e-7 */
524 RTFLOAT128U_INIT_C(0, 0x111111111111, 0x1111111111111111, 0x3ff8),
525 /* a5
526 * base-10: 1.38888888888888888888888888888888874e-3
527 * base-16: 5.b05b05b05b05b05b05b05b05b058@-3
528 * base-2 : 1.0110110000010110110000010110110000010110110000010110110000010110110000010110110000010110110000010110110000010110e-10 */
529 RTFLOAT128U_INIT_C(0, 0x6c16c16c16c1, 0x6c16c16c16c16c16, 0x3ff5),
530 /* a6
531 * base-10: 1.98412698412698412698412698412698412e-4
532 * base-16: d.00d00d00d00d00d00d00d00d00d0@-4
533 * base-2 : 1.1010000000011010000000011010000000011010000000011010000000011010000000011010000000011010000000011010000000011010e-13 */
534 RTFLOAT128U_INIT_C(0, 0xa01a01a01a01, 0xa01a01a01a01a01a, 0x3ff2),
535 /* a7
536 * base-10: 2.48015873015873015873015873015873015e-5
537 * base-16: 1.a01a01a01a01a01a01a01a01a01a@-4
538 * base-2 : 1.1010000000011010000000011010000000011010000000011010000000011010000000011010000000011010000000011010000000011010e-16 */
539 RTFLOAT128U_INIT_C(0, 0xa01a01a01a01, 0xa01a01a01a01a01a, 0x3fef),
540 /* a8
541 * base-10: 2.75573192239858906525573192239858902e-6
542 * base-16: 2.e3bc74aad8e671f5583911ca002e@-5
543 * base-2 : 1.0111000111011110001110100101010101101100011100110011100011111010101011000001110010001000111001010000000000010111e-19 */
544 RTFLOAT128U_INIT_C(0, 0x71de3a556c73, 0x38faac1c88e50017, 0x3fec),
545 /* a9
546 * base-10: 2.75573192239858906525573192239858865e-7
547 * base-16: 4.9f93edde27d71cbbc05b4fa999e0@-6
548 * base-2 : 1.0010011111100100111110110111011110001001111101011100011100101110111100000001011011010011111010100110011001111000e-22 */
549 RTFLOAT128U_INIT_C(0, 0x27e4fb7789f5, 0xc72ef016d3ea6678, 0x3fe9),
550 /* a10
551 * base-10: 2.50521083854417187750521083854417184e-8
552 * base-16: 6.b99159fd5138e3f9d1f92e0df71c@-7
553 * base-2 : 1.1010111001100100010101100111111101010100010011100011100011111110011101000111111001001011100000110111110111000111e-26 */
554 RTFLOAT128U_INIT_C(0, 0xae64567f544e, 0x38fe747e4b837dc7, 0x3fe5),
555 /* a11
556 * base-10: 2.08767569878680989792100903212014296e-9
557 * base-16: 8.f76c77fc6c4bdaa26d4c3d67f420@-8
558 * base-2 : 1.0001111011101101100011101111111110001101100010010111101101010100010011011010100110000111101011001111111010000100e-29 */
559 RTFLOAT128U_INIT_C(0, 0x1eed8eff8d89, 0x7b544da987acfe84, 0x3fe2),
560 /* a12
561 * base-10: 1.60590438368216145993923771701549472e-10
562 * base-16: b.092309d43684be51c198e91d7b40@-9
563 * base-2 : 1.0110000100100100011000010011101010000110110100001001011111001010001110000011001100011101001000111010111101101000e-33 */
564 RTFLOAT128U_INIT_C(0, 0x6124613a86d0, 0x97ca38331d23af68, 0x3fde),
565 /* a13
566 * base-10: 1.14707455977297247138516979786821043e-11
567 * base-16: c.9cba54603e4e905d6f8a2efd1f20@-10
568 * base-2 : 1.1001001110010111010010101000110000000111110010011101001000001011101011011111000101000101110111111010001111100100e-37 */
569 RTFLOAT128U_INIT_C(0, 0x93974a8c07c9, 0xd20badf145dfa3e4, 0x3fda),
570 /* a14
571 * base-10: 7.64716373181981647590113198578806964e-13
572 * base-16: d.73f9f399dc0f88ec32b587746578@-11
573 * base-2 : 1.1010111001111111001111100111001100111011100000011111000100011101100001100101011010110000111011101000110010101111e-41 */
574 RTFLOAT128U_INIT_C(0, 0xae7f3e733b81, 0xf11d8656b0ee8caf, 0x3fd6),
575 /* a15
576 * base-10: 4.77947733238738529743820749111754352e-14
577 * base-16: d.73f9f399dc0f88ec32b587746578@-12
578 * base-2 : 1.1010111001111111001111100111001100111011100000011111000100011101100001100101011010110000111011101000110010101111e-45 */
579 RTFLOAT128U_INIT_C(0, 0xae7f3e733b81, 0xf11d8656b0ee8caf, 0x3fd2),
580 /* a16
581 * base-10: 2.81145725434552076319894558301031970e-15
582 * base-16: c.a963b81856a53593028cbbb8d7f8@-13
583 * base-2 : 1.1001010100101100011101110000001100001010110101001010011010110010011000000101000110010111011101110001101011111111e-49 */
584 RTFLOAT128U_INIT_C(0, 0x952c77030ad4, 0xa6b2605197771aff, 0x3fce),
585 /* a17
586 * base-10: 1.56192069685862264622163643500573321e-16
587 * base-16: b.413c31dcbecbbdd8024435161550@-14
588 * base-2 : 1.0110100000100111100001100011101110010111110110010111011110111011000000000100100010000110101000101100001010101010e-53 */
589 RTFLOAT128U_INIT_C(0, 0x6827863b97d9, 0x77bb004886a2c2aa, 0x3fca),
590 /* a18
591 * base-10: 8.22063524662432971695598123687227980e-18
592 * base-16: 9.7a4da340a0ab92650f61dbdcb3a0@-15
593 * base-2 : 1.0010111101001001101101000110100000010100000101010111001001001100101000011110110000111011011110111001011001110100e-57 */
594 RTFLOAT128U_INIT_C(0, 0x2f49b4681415, 0x724ca1ec3b7b9674, 0x3fc6),
595 /* a19
596 * base-10: 4.11031762331216485847799061843614006e-19
597 * base-16: 7.950ae900808941ea72b4afe3c2e8@-16
598 * base-2 : 1.1110010101000010101110100100000000100000001000100101000001111010100111001010110100101011111110001111000010111010e-62 */
599 RTFLOAT128U_INIT_C(0, 0xe542ba402022, 0x507a9cad2bf8f0ba, 0x3fc1),
600 /* a20
601 * base-10: 7.04351638180413298434020229233492164e-20
602 * base-16: 1.4c9ee35db1d1f3c946fdcd48fd88@-16
603 * base-2 : 1.0100110010011110111000110101110110110001110100011111001111001001010001101111110111001101010010001111110110001000e-64 */
604 RTFLOAT128U_INIT_C(0, 0x4c9ee35db1d1, 0xf3c946fdcd48fd88, 0x3fbf),
605 /* a21
606 * base-10: 5.81527769640186708776361513365257702e-20
607 * base-16: 1.129e64bff606a2b9c9fc624481cd@-16
608 * base-2 : 1.0001001010011110011001001011111111110110000001101010001010111001110010011111110001100010010001001000000111001101e-64 */
609 RTFLOAT128U_INIT_C(0, 0x129e64bff606, 0xa2b9c9fc624481cd, 0x3fbf),
610};
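/*
 * The leading coefficients above are 1/(n+1)!, i.e. the Taylor series of
 * (e^y - 1)/y (the last few entries deviate from the plain factorials).  A
 * double-precision sketch of how such a Horner table can be used to get
 * 2^x - 1 (illustrative only; the real implementation works on the 128-bit
 * softfloat values above):
 * @code
 *  static double ExampleF2xm1Approx(double x)      // x in [-1, 1]
 *  {
 *      static double const s_adCoeffs[] =          // 1/(n+1)!
 *      { 1.0, 1.0/2, 1.0/6, 1.0/24, 1.0/120, 1.0/720, 1.0/5040, 1.0/40320 };
 *      double const y    = x * 0.6931471805599453; // x * ln(2), so 2^x = e^y
 *      double       dSum = s_adCoeffs[RT_ELEMENTS(s_adCoeffs) - 1];
 *      for (unsigned i = RT_ELEMENTS(s_adCoeffs) - 1; i > 0; i--)
 *          dSum = dSum * y + s_adCoeffs[i - 1];
 *      return dSum * y;                            // y * (e^y - 1)/y = 2^x - 1
 *  }
 * @endcode
 */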
611
612
613/*
614 * There are a few 64-bit-on-32-bit things we'd rather do in C. Actually, doing
615 * it all in C is probably safer at the moment; optimize what's necessary later, maybe.
616 */
617#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
618
619
620/*********************************************************************************************************************************
621* Binary Operations *
622*********************************************************************************************************************************/
623
624/*
625 * ADD
626 */
627
628IEM_DECL_IMPL_DEF(void, iemAImpl_add_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
629{
630 uint64_t uDst = *puDst;
631 uint64_t uResult = uDst + uSrc;
632 *puDst = uResult;
633 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 64, uResult < uDst, uSrc);
634}
635
636# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
637
638IEM_DECL_IMPL_DEF(void, iemAImpl_add_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
639{
640 uint32_t uDst = *puDst;
641 uint32_t uResult = uDst + uSrc;
642 *puDst = uResult;
643 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 32, uResult < uDst, uSrc);
644}
645
646
647IEM_DECL_IMPL_DEF(void, iemAImpl_add_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
648{
649 uint16_t uDst = *puDst;
650 uint16_t uResult = uDst + uSrc;
651 *puDst = uResult;
652 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 16, uResult < uDst, uSrc);
653}
654
655
656IEM_DECL_IMPL_DEF(void, iemAImpl_add_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
657{
658 uint8_t uDst = *puDst;
659 uint8_t uResult = uDst + uSrc;
660 *puDst = uResult;
661 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 8, uResult < uDst, uSrc);
662}
663
664# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
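/*
 * Usage sketch (illustrative, not part of the original file): all widths share
 * the same calling convention, and a wrapping 8-bit ADD sets CF through the
 * 'uResult < uDst' expression handed to the status-bits macro above.
 * @code
 *  uint8_t  uDst    = 0xff;
 *  uint32_t fEFlags = 0;
 *  iemAImpl_add_u8(&uDst, 0x01, &fEFlags);
 *  // uDst == 0x00; CF, ZF, AF and PF are set, while SF and OF are clear.
 * @endcode
 */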
665
666/*
667 * ADC
668 */
669
670IEM_DECL_IMPL_DEF(void, iemAImpl_adc_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
671{
672 if (!(*pfEFlags & X86_EFL_CF))
673 iemAImpl_add_u64(puDst, uSrc, pfEFlags);
674 else
675 {
676 uint64_t uDst = *puDst;
677 uint64_t uResult = uDst + uSrc + 1;
678 *puDst = uResult;
679 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 64, uResult <= uDst, uSrc);
680 }
681}
682
683# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
684
685IEM_DECL_IMPL_DEF(void, iemAImpl_adc_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
686{
687 if (!(*pfEFlags & X86_EFL_CF))
688 iemAImpl_add_u32(puDst, uSrc, pfEFlags);
689 else
690 {
691 uint32_t uDst = *puDst;
692 uint32_t uResult = uDst + uSrc + 1;
693 *puDst = uResult;
694 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 32, uResult <= uDst, uSrc);
695 }
696}
697
698
699IEM_DECL_IMPL_DEF(void, iemAImpl_adc_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
700{
701 if (!(*pfEFlags & X86_EFL_CF))
702 iemAImpl_add_u16(puDst, uSrc, pfEFlags);
703 else
704 {
705 uint16_t uDst = *puDst;
706 uint16_t uResult = uDst + uSrc + 1;
707 *puDst = uResult;
708 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 16, uResult <= uDst, uSrc);
709 }
710}
711
712
713IEM_DECL_IMPL_DEF(void, iemAImpl_adc_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
714{
715 if (!(*pfEFlags & X86_EFL_CF))
716 iemAImpl_add_u8(puDst, uSrc, pfEFlags);
717 else
718 {
719 uint8_t uDst = *puDst;
720 uint8_t uResult = uDst + uSrc + 1;
721 *puDst = uResult;
722 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 8, uResult <= uDst, uSrc);
723 }
724}
725
726# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
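/*
 * Why the CF-set branches above test 'uResult <= uDst' rather than '<': with
 * the extra +1, uResult == uDst happens exactly when uSrc + 1 wrapped to zero,
 * which is a carry-out.  Illustrative sketch:
 * @code
 *  uint8_t  uDst    = 0x10;
 *  uint32_t fEFlags = X86_EFL_CF;              // carry-in set
 *  iemAImpl_adc_u8(&uDst, 0xff, &fEFlags);     // 0x10 + 0xff + 1 = 0x110 -> 0x10
 *  // uDst == 0x10 again and CF is set: 'uResult <= uDst' caught the wrap.
 * @endcode
 */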
727
728/*
729 * SUB
730 */
731
732IEM_DECL_IMPL_DEF(void, iemAImpl_sub_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
733{
734 uint64_t uDst = *puDst;
735 uint64_t uResult = uDst - uSrc;
736 *puDst = uResult;
737 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 64, uDst < uSrc, uSrc ^ RT_BIT_64(63));
738}
739
740# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
741
742IEM_DECL_IMPL_DEF(void, iemAImpl_sub_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
743{
744 uint32_t uDst = *puDst;
745 uint32_t uResult = uDst - uSrc;
746 *puDst = uResult;
747 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 32, uDst < uSrc, uSrc ^ RT_BIT_32(31));
748}
749
750
751IEM_DECL_IMPL_DEF(void, iemAImpl_sub_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
752{
753 uint16_t uDst = *puDst;
754 uint16_t uResult = uDst - uSrc;
755 *puDst = uResult;
756 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 16, uDst < uSrc, uSrc ^ (uint16_t)0x8000);
757}
758
759
760IEM_DECL_IMPL_DEF(void, iemAImpl_sub_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
761{
762 uint8_t uDst = *puDst;
763 uint8_t uResult = uDst - uSrc;
764 *puDst = uResult;
765 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 8, uDst < uSrc, uSrc ^ (uint8_t)0x80);
766}
767
768# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
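/*
 * The 'uSrc ^ RT_BIT_64(63)' (and the narrower equivalents) passed as a_uSrcOf
 * above is the "xor with sign bit to convert" note from the arithmetic macro:
 * flipping only the sign bit turns the ADD overflow rule into the SUB one while
 * avoiding the corner case that a plain '0 - uSrc' conversion would hit
 * (negating 0x80 in 8 bits yields 0x80 again).  Illustrative 8-bit check:
 * @code
 *  uint8_t  uDst    = 0x80;
 *  uint32_t fEFlags = 0;
 *  iemAImpl_sub_u8(&uDst, 0x01, &fEFlags);     // signed -128 - 1 underflows
 *  // uDst == 0x7f; OF is set because
 *  //   ~(0x80 ^ (0x01 ^ 0x80)) & 0x80 & (0x7f ^ 0x80) = 0xfe & 0x80 & 0xff = 0x80.
 *  // CF is clear since no unsigned borrow occurred (0x80 >= 0x01).
 * @endcode
 */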
769
770/*
771 * SBB
772 */
773
774IEM_DECL_IMPL_DEF(void, iemAImpl_sbb_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
775{
776 if (!(*pfEFlags & X86_EFL_CF))
777 iemAImpl_sub_u64(puDst, uSrc, pfEFlags);
778 else
779 {
780 uint64_t uDst = *puDst;
781 uint64_t uResult = uDst - uSrc - 1;
782 *puDst = uResult;
783 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 64, uDst <= uSrc, uSrc ^ RT_BIT_64(63));
784 }
785}
786
787# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
788
789IEM_DECL_IMPL_DEF(void, iemAImpl_sbb_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
790{
791 if (!(*pfEFlags & X86_EFL_CF))
792 iemAImpl_sub_u32(puDst, uSrc, pfEFlags);
793 else
794 {
795 uint32_t uDst = *puDst;
796 uint32_t uResult = uDst - uSrc - 1;
797 *puDst = uResult;
798 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 32, uDst <= uSrc, uSrc ^ RT_BIT_32(31));
799 }
800}
801
802
803IEM_DECL_IMPL_DEF(void, iemAImpl_sbb_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
804{
805 if (!(*pfEFlags & X86_EFL_CF))
806 iemAImpl_sub_u16(puDst, uSrc, pfEFlags);
807 else
808 {
809 uint16_t uDst = *puDst;
810 uint16_t uResult = uDst - uSrc - 1;
811 *puDst = uResult;
812 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 16, uDst <= uSrc, uSrc ^ (uint16_t)0x8000);
813 }
814}
815
816
817IEM_DECL_IMPL_DEF(void, iemAImpl_sbb_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
818{
819 if (!(*pfEFlags & X86_EFL_CF))
820 iemAImpl_sub_u8(puDst, uSrc, pfEFlags);
821 else
822 {
823 uint8_t uDst = *puDst;
824 uint8_t uResult = uDst - uSrc - 1;
825 *puDst = uResult;
826 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 8, uDst <= uSrc, uSrc ^ (uint8_t)0x80);
827 }
828}
829
830# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
831
832
833/*
834 * OR
835 */
836
837IEM_DECL_IMPL_DEF(void, iemAImpl_or_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
838{
839 uint64_t uResult = *puDst | uSrc;
840 *puDst = uResult;
841 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 64, 0);
842}
843
844# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
845
846IEM_DECL_IMPL_DEF(void, iemAImpl_or_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
847{
848 uint32_t uResult = *puDst | uSrc;
849 *puDst = uResult;
850 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 32, 0);
851}
852
853
854IEM_DECL_IMPL_DEF(void, iemAImpl_or_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
855{
856 uint16_t uResult = *puDst | uSrc;
857 *puDst = uResult;
858 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 16, 0);
859}
860
861
862IEM_DECL_IMPL_DEF(void, iemAImpl_or_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
863{
864 uint8_t uResult = *puDst | uSrc;
865 *puDst = uResult;
866 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 8, 0);
867}
868
869# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
870
871/*
872 * XOR
873 */
874
875IEM_DECL_IMPL_DEF(void, iemAImpl_xor_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
876{
877 uint64_t uResult = *puDst ^ uSrc;
878 *puDst = uResult;
879 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 64, 0);
880}
881
882# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
883
884IEM_DECL_IMPL_DEF(void, iemAImpl_xor_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
885{
886 uint32_t uResult = *puDst ^ uSrc;
887 *puDst = uResult;
888 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 32, 0);
889}
890
891
892IEM_DECL_IMPL_DEF(void, iemAImpl_xor_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
893{
894 uint16_t uResult = *puDst ^ uSrc;
895 *puDst = uResult;
896 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 16, 0);
897}
898
899
900IEM_DECL_IMPL_DEF(void, iemAImpl_xor_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
901{
902 uint8_t uResult = *puDst ^ uSrc;
903 *puDst = uResult;
904 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 8, 0);
905}
906
907# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
908
909/*
910 * AND
911 */
912
913IEM_DECL_IMPL_DEF(void, iemAImpl_and_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
914{
915 uint64_t const uResult = *puDst & uSrc;
916 *puDst = uResult;
917 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 64, 0);
918}
919
920# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
921
922IEM_DECL_IMPL_DEF(void, iemAImpl_and_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
923{
924 uint32_t const uResult = *puDst & uSrc;
925 *puDst = uResult;
926 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 32, 0);
927}
928
929
930IEM_DECL_IMPL_DEF(void, iemAImpl_and_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
931{
932 uint16_t const uResult = *puDst & uSrc;
933 *puDst = uResult;
934 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 16, 0);
935}
936
937
938IEM_DECL_IMPL_DEF(void, iemAImpl_and_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
939{
940 uint8_t const uResult = *puDst & uSrc;
941 *puDst = uResult;
942 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 8, 0);
943}
944
945# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
946#endif /* !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY) */
947
948/*
949 * ANDN (BMI1 instruction)
950 */
951
952IEM_DECL_IMPL_DEF(void, iemAImpl_andn_u64_fallback,(uint64_t *puDst, uint64_t uSrc1, uint64_t uSrc2, uint32_t *pfEFlags))
953{
954 uint64_t const uResult = ~uSrc1 & uSrc2;
955 *puDst = uResult;
956 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 64, 0);
957}
958
959
960IEM_DECL_IMPL_DEF(void, iemAImpl_andn_u32_fallback,(uint32_t *puDst, uint32_t uSrc1, uint32_t uSrc2, uint32_t *pfEFlags))
961{
962 uint32_t const uResult = ~uSrc1 & uSrc2;
963 *puDst = uResult;
964 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 32, 0);
965}
966
967
968#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
969IEM_DECL_IMPL_DEF(void, iemAImpl_andn_u64,(uint64_t *puDst, uint64_t uSrc1, uint64_t uSrc2, uint32_t *pfEFlags))
970{
971 iemAImpl_andn_u64_fallback(puDst, uSrc1, uSrc2, pfEFlags);
972}
973#endif
974
975
976#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
977IEM_DECL_IMPL_DEF(void, iemAImpl_andn_u32,(uint32_t *puDst, uint32_t uSrc1, uint32_t uSrc2, uint32_t *pfEFlags))
978{
979 iemAImpl_andn_u32_fallback(puDst, uSrc1, uSrc2, pfEFlags);
980}
981#endif
982
983#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
984
985/*
986 * CMP
987 */
988
989IEM_DECL_IMPL_DEF(void, iemAImpl_cmp_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
990{
991 uint64_t uDstTmp = *puDst;
992 iemAImpl_sub_u64(&uDstTmp, uSrc, pfEFlags);
993}
994
995# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
996
997IEM_DECL_IMPL_DEF(void, iemAImpl_cmp_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
998{
999 uint32_t uDstTmp = *puDst;
1000 iemAImpl_sub_u32(&uDstTmp, uSrc, pfEFlags);
1001}
1002
1003
1004IEM_DECL_IMPL_DEF(void, iemAImpl_cmp_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1005{
1006 uint16_t uDstTmp = *puDst;
1007 iemAImpl_sub_u16(&uDstTmp, uSrc, pfEFlags);
1008}
1009
1010
1011IEM_DECL_IMPL_DEF(void, iemAImpl_cmp_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
1012{
1013 uint8_t uDstTmp = *puDst;
1014 iemAImpl_sub_u8(&uDstTmp, uSrc, pfEFlags);
1015}
1016
1017# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1018
1019/*
1020 * TEST
1021 */
1022
1023IEM_DECL_IMPL_DEF(void, iemAImpl_test_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1024{
1025 uint64_t uResult = *puDst & uSrc;
1026 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 64, 0);
1027}
1028
1029# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1030
1031IEM_DECL_IMPL_DEF(void, iemAImpl_test_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1032{
1033 uint32_t uResult = *puDst & uSrc;
1034 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 32, 0);
1035}
1036
1037
1038IEM_DECL_IMPL_DEF(void, iemAImpl_test_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1039{
1040 uint16_t uResult = *puDst & uSrc;
1041 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 16, 0);
1042}
1043
1044
1045IEM_DECL_IMPL_DEF(void, iemAImpl_test_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
1046{
1047 uint8_t uResult = *puDst & uSrc;
1048 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 8, 0);
1049}
1050
1051# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1052
1053
1054/*
1055 * LOCK prefixed variants of the above
1056 */
1057
1058/** Locked binary operand operation (width given by a_cBitsWidth). */
1059# define DO_LOCKED_BIN_OP(a_Mnemonic, a_cBitsWidth) \
1060 do { \
1061 uint ## a_cBitsWidth ## _t uOld = ASMAtomicUoReadU ## a_cBitsWidth(puDst); \
1062 uint ## a_cBitsWidth ## _t uTmp; \
1063 uint32_t fEflTmp; \
1064 do \
1065 { \
1066 uTmp = uOld; \
1067 fEflTmp = *pfEFlags; \
1068 iemAImpl_ ## a_Mnemonic ## _u ## a_cBitsWidth(&uTmp, uSrc, &fEflTmp); \
1069 } while (!ASMAtomicCmpXchgExU ## a_cBitsWidth(puDst, uTmp, uOld, &uOld)); \
1070 *pfEFlags = fEflTmp; \
1071 } while (0)
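/*
 * The loop above is a classic compare-and-swap retry: compute the result and
 * flags on a private copy, then publish only if *puDst still holds the value
 * the computation used; otherwise ASMAtomicCmpXchgExU* returns false, hands
 * back the current value in uOld, and the operation is redone.  A minimal
 * sketch of the same idea using C11 atomics (illustrative analogy only, not
 * what IEM uses):
 * @code
 *  #include <stdatomic.h>
 *  static void ExampleLockedAdd32(_Atomic uint32_t *pu32, uint32_t uSrc)
 *  {
 *      uint32_t uOld = atomic_load(pu32);
 *      uint32_t uNew;
 *      do
 *          uNew = uOld + uSrc;     // redo the op on the latest observed value
 *      while (!atomic_compare_exchange_weak(pu32, &uOld, uNew));
 *  }
 * @endcode
 */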
1072
1073
1074#define EMIT_LOCKED_BIN_OP(a_Mnemonic, a_cBitsWidth) \
1075 IEM_DECL_IMPL_DEF(void, iemAImpl_ ## a_Mnemonic ## _u ## a_cBitsWidth ## _locked,(uint ## a_cBitsWidth ## _t *puDst, \
1076 uint ## a_cBitsWidth ## _t uSrc, \
1077 uint32_t *pfEFlags)) \
1078 { \
1079 DO_LOCKED_BIN_OP(a_Mnemonic, a_cBitsWidth); \
1080 }
1081
1082EMIT_LOCKED_BIN_OP(add, 64)
1083EMIT_LOCKED_BIN_OP(adc, 64)
1084EMIT_LOCKED_BIN_OP(sub, 64)
1085EMIT_LOCKED_BIN_OP(sbb, 64)
1086EMIT_LOCKED_BIN_OP(or, 64)
1087EMIT_LOCKED_BIN_OP(xor, 64)
1088EMIT_LOCKED_BIN_OP(and, 64)
1089# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1090EMIT_LOCKED_BIN_OP(add, 32)
1091EMIT_LOCKED_BIN_OP(adc, 32)
1092EMIT_LOCKED_BIN_OP(sub, 32)
1093EMIT_LOCKED_BIN_OP(sbb, 32)
1094EMIT_LOCKED_BIN_OP(or, 32)
1095EMIT_LOCKED_BIN_OP(xor, 32)
1096EMIT_LOCKED_BIN_OP(and, 32)
1097
1098EMIT_LOCKED_BIN_OP(add, 16)
1099EMIT_LOCKED_BIN_OP(adc, 16)
1100EMIT_LOCKED_BIN_OP(sub, 16)
1101EMIT_LOCKED_BIN_OP(sbb, 16)
1102EMIT_LOCKED_BIN_OP(or, 16)
1103EMIT_LOCKED_BIN_OP(xor, 16)
1104EMIT_LOCKED_BIN_OP(and, 16)
1105
1106EMIT_LOCKED_BIN_OP(add, 8)
1107EMIT_LOCKED_BIN_OP(adc, 8)
1108EMIT_LOCKED_BIN_OP(sub, 8)
1109EMIT_LOCKED_BIN_OP(sbb, 8)
1110EMIT_LOCKED_BIN_OP(or, 8)
1111EMIT_LOCKED_BIN_OP(xor, 8)
1112EMIT_LOCKED_BIN_OP(and, 8)
1113# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1114
1115
1116/*
1117 * Bit operations (same signature as above).
1118 */
1119
1120/*
1121 * BT
1122 */
1123
1124IEM_DECL_IMPL_DEF(void, iemAImpl_bt_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1125{
1126 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1127 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1128 Assert(uSrc < 64);
1129 uint64_t uDst = *puDst;
1130 if (uDst & RT_BIT_64(uSrc))
1131 *pfEFlags |= X86_EFL_CF;
1132 else
1133 *pfEFlags &= ~X86_EFL_CF;
1134}
1135
1136# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1137
1138IEM_DECL_IMPL_DEF(void, iemAImpl_bt_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1139{
1140 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1141 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1142 Assert(uSrc < 32);
1143 uint32_t uDst = *puDst;
1144 if (uDst & RT_BIT_32(uSrc))
1145 *pfEFlags |= X86_EFL_CF;
1146 else
1147 *pfEFlags &= ~X86_EFL_CF;
1148}
1149
1150IEM_DECL_IMPL_DEF(void, iemAImpl_bt_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1151{
1152 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1153 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1154 Assert(uSrc < 16);
1155 uint16_t uDst = *puDst;
1156 if (uDst & RT_BIT_32(uSrc))
1157 *pfEFlags |= X86_EFL_CF;
1158 else
1159 *pfEFlags &= ~X86_EFL_CF;
1160}
1161
1162# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
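/*
 * In other words, BT only copies the addressed bit into CF and never writes
 * the destination.  Illustrative equivalent (uSrc is expected to be below the
 * operand width, as the asserts above state):
 * @code
 *  static bool ExampleBitTest64(uint64_t uDst, unsigned iBit)
 *  {
 *      return (uDst >> (iBit & 63)) & 1;   // the value that ends up in CF
 *  }
 * @endcode
 */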
1163
1164/*
1165 * BTC
1166 */
1167
1168IEM_DECL_IMPL_DEF(void, iemAImpl_btc_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1169{
1170 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1171 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1172 Assert(uSrc < 64);
1173 uint64_t fMask = RT_BIT_64(uSrc);
1174 uint64_t uDst = *puDst;
1175 if (uDst & fMask)
1176 {
1177 uDst &= ~fMask;
1178 *puDst = uDst;
1179 *pfEFlags |= X86_EFL_CF;
1180 }
1181 else
1182 {
1183 uDst |= fMask;
1184 *puDst = uDst;
1185 *pfEFlags &= ~X86_EFL_CF;
1186 }
1187}
1188
1189# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1190
1191IEM_DECL_IMPL_DEF(void, iemAImpl_btc_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1192{
1193 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1194 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1195 Assert(uSrc < 32);
1196 uint32_t fMask = RT_BIT_32(uSrc);
1197 uint32_t uDst = *puDst;
1198 if (uDst & fMask)
1199 {
1200 uDst &= ~fMask;
1201 *puDst = uDst;
1202 *pfEFlags |= X86_EFL_CF;
1203 }
1204 else
1205 {
1206 uDst |= fMask;
1207 *puDst = uDst;
1208 *pfEFlags &= ~X86_EFL_CF;
1209 }
1210}
1211
1212
1213IEM_DECL_IMPL_DEF(void, iemAImpl_btc_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1214{
1215 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1216 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1217 Assert(uSrc < 16);
1218 uint16_t fMask = RT_BIT_32(uSrc);
1219 uint16_t uDst = *puDst;
1220 if (uDst & fMask)
1221 {
1222 uDst &= ~fMask;
1223 *puDst = uDst;
1224 *pfEFlags |= X86_EFL_CF;
1225 }
1226 else
1227 {
1228 uDst |= fMask;
1229 *puDst = uDst;
1230 *pfEFlags &= ~X86_EFL_CF;
1231 }
1232}
1233
1234# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1235
1236/*
1237 * BTR
1238 */
1239
1240IEM_DECL_IMPL_DEF(void, iemAImpl_btr_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1241{
1242 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after a
1243 logical operation (AND/OR/whatever). */
1244 Assert(uSrc < 64);
1245 uint64_t fMask = RT_BIT_64(uSrc);
1246 uint64_t uDst = *puDst;
1247 if (uDst & fMask)
1248 {
1249 uDst &= ~fMask;
1250 *puDst = uDst;
1251 *pfEFlags |= X86_EFL_CF;
1252 }
1253 else
1254 *pfEFlags &= ~X86_EFL_CF;
1255}
1256
1257# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1258
1259IEM_DECL_IMPL_DEF(void, iemAImpl_btr_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1260{
1261 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after a
1262 logical operation (AND/OR/whatever). */
1263 Assert(uSrc < 32);
1264 uint32_t fMask = RT_BIT_32(uSrc);
1265 uint32_t uDst = *puDst;
1266 if (uDst & fMask)
1267 {
1268 uDst &= ~fMask;
1269 *puDst = uDst;
1270 *pfEFlags |= X86_EFL_CF;
1271 }
1272 else
1273 *pfEFlags &= ~X86_EFL_CF;
1274}
1275
1276
1277IEM_DECL_IMPL_DEF(void, iemAImpl_btr_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1278{
1279 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after a
1280 logical operation (AND/OR/whatever). */
1281 Assert(uSrc < 16);
1282 uint16_t fMask = RT_BIT_32(uSrc);
1283 uint16_t uDst = *puDst;
1284 if (uDst & fMask)
1285 {
1286 uDst &= ~fMask;
1287 *puDst = uDst;
1288 *pfEFlags |= X86_EFL_CF;
1289 }
1290 else
1291 *pfEFlags &= ~X86_EFL_CF;
1292}
1293
1294# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1295
1296/*
1297 * BTS
1298 */
1299
1300IEM_DECL_IMPL_DEF(void, iemAImpl_bts_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1301{
1302 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after a
1303 logical operation (AND/OR/whatever). */
1304 Assert(uSrc < 64);
1305 uint64_t fMask = RT_BIT_64(uSrc);
1306 uint64_t uDst = *puDst;
1307 if (uDst & fMask)
1308 *pfEFlags |= X86_EFL_CF;
1309 else
1310 {
1311 uDst |= fMask;
1312 *puDst = uDst;
1313 *pfEFlags &= ~X86_EFL_CF;
1314 }
1315}
1316
1317# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1318
1319IEM_DECL_IMPL_DEF(void, iemAImpl_bts_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1320{
1321 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after a
1322 logical operation (AND/OR/whatever). */
1323 Assert(uSrc < 32);
1324 uint32_t fMask = RT_BIT_32(uSrc);
1325 uint32_t uDst = *puDst;
1326 if (uDst & fMask)
1327 *pfEFlags |= X86_EFL_CF;
1328 else
1329 {
1330 uDst |= fMask;
1331 *puDst = uDst;
1332 *pfEFlags &= ~X86_EFL_CF;
1333 }
1334}
1335
1336
1337IEM_DECL_IMPL_DEF(void, iemAImpl_bts_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1338{
1339 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after a
1340 logical operation (AND/OR/whatever). */
1341 Assert(uSrc < 16);
1342 uint16_t fMask = RT_BIT_32(uSrc);
1343 uint16_t uDst = *puDst;
1344 if (uDst & fMask)
1345 *pfEFlags |= X86_EFL_CF;
1346 else
1347 {
1348 uDst |= fMask;
1349 *puDst = uDst;
1350 *pfEFlags &= ~X86_EFL_CF;
1351 }
1352}
1353
1354# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1355
1356
1357EMIT_LOCKED_BIN_OP(btc, 64)
1358EMIT_LOCKED_BIN_OP(btr, 64)
1359EMIT_LOCKED_BIN_OP(bts, 64)
1360# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1361EMIT_LOCKED_BIN_OP(btc, 32)
1362EMIT_LOCKED_BIN_OP(btr, 32)
1363EMIT_LOCKED_BIN_OP(bts, 32)
1364
1365EMIT_LOCKED_BIN_OP(btc, 16)
1366EMIT_LOCKED_BIN_OP(btr, 16)
1367EMIT_LOCKED_BIN_OP(bts, 16)
1368# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1369
1370
1371/*
1372 * Helpers for BSR and BSF.
1373 *
1374 * Note! "undefined" flags: OF, SF, AF, PF, CF.
1375 * Intel behavior modelled on 10980xe, AMD on 3990X. Other microarchitectures
1376 * may produce different results (see https://www.sandpile.org/x86/flags.htm),
1377 * but we restrict ourselves to emulating these recent microarchitectures.
1378 */
1379#define SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, a_iBit) do { \
1380 unsigned iBit = (a_iBit); \
1381 uint32_t fEfl = *pfEFlags & ~(X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF); \
1382 if (iBit) \
1383 { \
1384 *puDst = --iBit; \
1385 fEfl |= g_afParity[iBit]; \
1386 } \
1387 else \
1388 fEfl |= X86_EFL_ZF | X86_EFL_PF; \
1389 *pfEFlags = fEfl; \
1390 } while (0)
1391#define SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, a_iBit) do { \
1392 unsigned const iBit = (a_iBit); \
1393 if (iBit) \
1394 { \
1395 *puDst = iBit - 1; \
1396 *pfEFlags &= ~X86_EFL_ZF; \
1397 } \
1398 else \
1399 *pfEFlags |= X86_EFL_ZF; \
1400 } while (0)
1401
1402
1403/*
1404 * BSF - first (least significant) bit set
1405 */
1406IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1407{
1408 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitFirstSetU64(uSrc));
1409}
1410
1411IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u64_intel,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1412{
1413 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitFirstSetU64(uSrc));
1414}
1415
1416IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u64_amd,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1417{
1418 SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, ASMBitFirstSetU64(uSrc));
1419}
1420
1421# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1422
1423IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1424{
1425 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitFirstSetU32(uSrc));
1426}
1427
1428IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u32_intel,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1429{
1430 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitFirstSetU32(uSrc));
1431}
1432
1433IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u32_amd,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1434{
1435 SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, ASMBitFirstSetU32(uSrc));
1436}
1437
1438
1439IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1440{
1441 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitFirstSetU16(uSrc));
1442}
1443
1444IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u16_intel,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1445{
1446 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitFirstSetU16(uSrc));
1447}
1448
1449IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u16_amd,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1450{
1451 SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, ASMBitFirstSetU16(uSrc));
1452}
1453
1454# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
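/*
 * The ASMBit{First,Last}SetU* helpers used above return a 1-based bit index
 * and 0 when no bit is set, which is why the Intel macro decrements before
 * storing.  Illustrative expectations for the 32-bit Intel variant:
 * @code
 *  uint32_t uDst = 0, fEFlags = 0;
 *  iemAImpl_bsf_u32_intel(&uDst, 0x00000008, &fEFlags);
 *  // uDst == 3 (lowest set bit), ZF clear, PF set (g_afParity[3]).
 *  iemAImpl_bsf_u32_intel(&uDst, 0x00000000, &fEFlags);
 *  // Source is zero: uDst is left untouched (still 3) and ZF + PF are set.
 * @endcode
 */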
1455
1456
1457/*
1458 * BSR - last (most significant) bit set
1459 */
1460IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1461{
1462 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitLastSetU64(uSrc));
1463}
1464
1465IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u64_intel,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1466{
1467 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitLastSetU64(uSrc));
1468}
1469
1470IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u64_amd,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1471{
1472 SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, ASMBitLastSetU64(uSrc));
1473}
1474
1475# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1476
1477IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1478{
1479 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitLastSetU32(uSrc));
1480}
1481
1482IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u32_intel,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1483{
1484 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitLastSetU32(uSrc));
1485}
1486
1487IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u32_amd,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1488{
1489 SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, ASMBitLastSetU32(uSrc));
1490}
1491
1492
1493IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1494{
1495 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitLastSetU16(uSrc));
1496}
1497
1498IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u16_intel,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1499{
1500 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitLastSetU16(uSrc));
1501}
1502
1503IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u16_amd,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1504{
1505 SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, ASMBitLastSetU16(uSrc));
1506}
1507
1508# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1509
1510
1511/*
1512 * Helpers for LZCNT and TZCNT.
1513 */
1514#define SET_BIT_CNT_SEARCH_RESULT_INTEL(a_puDst, a_uSrc, a_pfEFlags, a_uResult) do { \
1515 unsigned const uResult = (a_uResult); \
1516 *(a_puDst) = uResult; \
1517 uint32_t fEfl = *(a_pfEFlags) & ~(X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF); \
1518 if (uResult) \
1519 fEfl |= g_afParity[uResult]; \
1520 else \
1521 fEfl |= X86_EFL_ZF | X86_EFL_PF; \
1522 if (!a_uSrc) \
1523 fEfl |= X86_EFL_CF; \
1524 *(a_pfEFlags) = fEfl; \
1525 } while (0)
1526#define SET_BIT_CNT_SEARCH_RESULT_AMD(a_puDst, a_uSrc, a_pfEFlags, a_uResult) do { \
1527 unsigned const uResult = (a_uResult); \
1528 *(a_puDst) = uResult; \
1529 uint32_t fEfl = *(a_pfEFlags) & ~(X86_EFL_ZF | X86_EFL_CF); \
1530 if (!uResult) \
1531 fEfl |= X86_EFL_ZF; \
1532 if (!a_uSrc) \
1533 fEfl |= X86_EFL_CF; \
1534 *(a_pfEFlags) = fEfl; \
1535 } while (0)
1536
1537
1538/*
1539 * LZCNT - count leading zero bits.
1540 */
1541IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1542{
1543 iemAImpl_lzcnt_u64_intel(puDst, uSrc, pfEFlags);
1544}
1545
1546IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u64_intel,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1547{
1548 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, pfEFlags, ASMCountLeadingZerosU64(uSrc));
1549}
1550
1551IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u64_amd,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1552{
1553 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, pfEFlags, ASMCountLeadingZerosU64(uSrc));
1554}
1555
1556# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1557
1558IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1559{
1560 iemAImpl_lzcnt_u32_intel(puDst, uSrc, pfEFlags);
1561}
1562
1563IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u32_intel,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1564{
1565 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, pfEFlags, ASMCountLeadingZerosU32(uSrc));
1566}
1567
1568IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u32_amd,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1569{
1570 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, pfEFlags, ASMCountLeadingZerosU32(uSrc));
1571}
1572
1573
1574IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1575{
1576 iemAImpl_lzcnt_u16_intel(puDst, uSrc, pfEFlags);
1577}
1578
1579IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u16_intel,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1580{
1581 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, pfEFlags, ASMCountLeadingZerosU16(uSrc));
1582}
1583
1584IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u16_amd,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1585{
1586 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, pfEFlags, ASMCountLeadingZerosU16(uSrc));
1587}
1588
1589# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
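/*
 * Unlike BSF/BSR, these instructions produce a well-defined count even for a
 * zero source (the full operand width) and report that case through CF, while
 * ZF reflects a zero result (i.e. the top bit of the source was set).
 * Illustrative expectations for the 32-bit Intel variant:
 * @code
 *  uint32_t uDst = 0, fEFlags = 0;
 *  iemAImpl_lzcnt_u32_intel(&uDst, 0x00008000, &fEFlags);  // uDst == 16, CF and ZF clear
 *  iemAImpl_lzcnt_u32_intel(&uDst, 0x00000000, &fEFlags);  // uDst == 32, CF set
 *  iemAImpl_lzcnt_u32_intel(&uDst, 0x80000000, &fEFlags);  // uDst == 0,  ZF set
 * @endcode
 */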
1590
1591
1592/*
1593 * TZCNT - count trailing zero bits.
1594 */
1595IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1596{
1597 iemAImpl_tzcnt_u64_intel(puDst, uSrc, pfEFlags);
1598}
1599
1600IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u64_intel,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1601{
1602 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, pfEFlags, ASMCountTrailingZerosU64(uSrc));
1603}
1604
1605IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u64_amd,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1606{
1607 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, pfEFlags, ASMCountTrailingZerosU64(uSrc));
1608}
1609
1610# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1611
1612IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1613{
1614 iemAImpl_tzcnt_u32_intel(puDst, uSrc, pfEFlags);
1615}
1616
1617IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u32_intel,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1618{
1619 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, pfEFlags, ASMCountTrailingZerosU32(uSrc));
1620}
1621
1622IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u32_amd,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1623{
1624 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, pfEFlags, ASMCountTrailingZerosU32(uSrc));
1625}
1626
1627
1628IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1629{
1630 iemAImpl_tzcnt_u16_intel(puDst, uSrc, pfEFlags);
1631}
1632
1633IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u16_intel,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1634{
1635 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, pfEFlags, ASMCountTrailingZerosU16(uSrc));
1636}
1637
1638IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u16_amd,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1639{
1640 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, pfEFlags, ASMCountTrailingZerosU16(uSrc));
1641}
1642
1643# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1644#endif /* !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY) */
1645
1646/*
1647 * BEXTR (BMI1 instruction)
1648 */
1649#define EMIT_BEXTR(a_cBits, a_Type, a_Suffix) \
1650IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_bextr_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc1, \
1651 a_Type uSrc2, uint32_t *pfEFlags)) \
1652{ \
1653    /* uSrc1 is treated as if it were zero-extended to 512 bits. */ \
1654 uint32_t fEfl = *pfEFlags & ~(X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF); \
1655 a_Type uResult; \
1656 uint8_t const iFirstBit = (uint8_t)uSrc2; \
1657 if (iFirstBit < a_cBits) \
1658 { \
1659 uResult = uSrc1 >> iFirstBit; \
1660 uint8_t const cBits = (uint8_t)(uSrc2 >> 8); \
1661 if (cBits < a_cBits) \
1662 uResult &= RT_CONCAT(RT_BIT_,a_cBits)(cBits) - 1; \
1663 *puDst = uResult; \
1664 if (!uResult) \
1665 fEfl |= X86_EFL_ZF; \
1666 } \
1667 else \
1668 { \
1669 *puDst = uResult = 0; \
1670 fEfl |= X86_EFL_ZF; \
1671 } \
1672 /** @todo complete flag calculations. */ \
1673 *pfEFlags = fEfl; \
1674}
1675
1676EMIT_BEXTR(64, uint64_t, _fallback)
1677EMIT_BEXTR(32, uint32_t, _fallback)
1678#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1679EMIT_BEXTR(64, uint64_t, RT_NOTHING)
1680#endif
1681#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1682EMIT_BEXTR(32, uint32_t, RT_NOTHING)
1683#endif
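
/*
 * Illustration only (not part of the build): how the BEXTR control word in
 * uSrc2 is decoded by the helper above - bits 0..7 give the start bit and
 * bits 8..15 the field length.  Values are hypothetical.
 */
#if 0 /* example sketch, hypothetical helper */
static void iemExampleBextr(void)
{
    uint32_t uDst, fEfl = 0;
    /* Extract 8 bits starting at bit 4: (0xabcd1234 >> 4) & 0xff == 0x23. */
    iemAImpl_bextr_u32_fallback(&uDst, UINT32_C(0xabcd1234), (8 << 8) | 4, &fEfl);
    /* uDst == 0x23; ZF is clear since the result is non-zero. */
}
#endif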
1684
1685/*
1686 * BLSR (BMI1 instruction)
1687 */
1688#define EMIT_BLSR(a_cBits, a_Type, a_Suffix) \
1689IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_blsr_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc, uint32_t *pfEFlags)) \
1690{ \
1691 uint32_t fEfl1 = *pfEFlags; \
1692 uint32_t fEfl2 = fEfl1; \
1693 *puDst = uSrc; \
1694 iemAImpl_sub_u ## a_cBits(&uSrc, 1, &fEfl1); \
1695 iemAImpl_and_u ## a_cBits(puDst, uSrc, &fEfl2); \
1696 \
1697 /* AMD: The carry flag is from the SUB operation. */ \
1698 /* 10890xe: PF always cleared? */ \
1699 fEfl2 &= ~(X86_EFL_CF | X86_EFL_PF); \
1700 fEfl2 |= fEfl1 & X86_EFL_CF; \
1701 *pfEFlags = fEfl2; \
1702}
1703
1704EMIT_BLSR(64, uint64_t, _fallback)
1705EMIT_BLSR(32, uint32_t, _fallback)
1706#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1707EMIT_BLSR(64, uint64_t, RT_NOTHING)
1708#endif
1709#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1710EMIT_BLSR(32, uint32_t, RT_NOTHING)
1711#endif
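
/*
 * Illustration only (not part of the build): BLSR resets the lowest set bit,
 * i.e. it computes uSrc & (uSrc - 1), which is what the SUB+AND pairing above
 * does.  Values are hypothetical.
 */
#if 0 /* example sketch, hypothetical helper */
static void iemExampleBlsr(void)
{
    uint32_t uDst, fEfl = 0;
    iemAImpl_blsr_u32_fallback(&uDst, UINT32_C(0xb0), &fEfl);
    /* uDst == 0xa0: bit 4, the lowest set bit, is cleared; CF is clear since uSrc != 0. */
}
#endif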
1712
1713/*
1714 * BLSMSK (BMI1 instruction)
1715 */
1716#define EMIT_BLSMSK(a_cBits, a_Type, a_Suffix) \
1717IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_blsmsk_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc, uint32_t *pfEFlags)) \
1718{ \
1719 uint32_t fEfl1 = *pfEFlags; \
1720 uint32_t fEfl2 = fEfl1; \
1721 *puDst = uSrc; \
1722 iemAImpl_sub_u ## a_cBits(&uSrc, 1, &fEfl1); \
1723 iemAImpl_xor_u ## a_cBits(puDst, uSrc, &fEfl2); \
1724 \
1725 /* AMD: The carry flag is from the SUB operation. */ \
1726 /* 10890xe: PF always cleared? */ \
1727 fEfl2 &= ~(X86_EFL_CF | X86_EFL_PF); \
1728 fEfl2 |= fEfl1 & X86_EFL_CF; \
1729 *pfEFlags = fEfl2; \
1730}
1731
1732EMIT_BLSMSK(64, uint64_t, _fallback)
1733EMIT_BLSMSK(32, uint32_t, _fallback)
1734#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1735EMIT_BLSMSK(64, uint64_t, RT_NOTHING)
1736#endif
1737#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1738EMIT_BLSMSK(32, uint32_t, RT_NOTHING)
1739#endif
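
/*
 * Illustration only (not part of the build): BLSMSK computes uSrc ^ (uSrc - 1),
 * a mask covering everything up to and including the lowest set bit, which is
 * what the SUB+XOR pairing above does.  Values are hypothetical.
 */
#if 0 /* example sketch, hypothetical helper */
static void iemExampleBlsmsk(void)
{
    uint32_t uDst, fEfl = 0;
    iemAImpl_blsmsk_u32_fallback(&uDst, UINT32_C(0xb0), &fEfl);
    /* uDst == 0x1f: bits 0 thru 4 are set, bit 4 being the lowest set bit of 0xb0. */
}
#endif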
1740
1741/*
1742 * BLSI (BMI1 instruction)
1743 */
1744#define EMIT_BLSI(a_cBits, a_Type, a_Suffix) \
1745IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_blsi_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc, uint32_t *pfEFlags)) \
1746{ \
1747 uint32_t fEfl1 = *pfEFlags; \
1748 uint32_t fEfl2 = fEfl1; \
1749 *puDst = uSrc; \
1750 iemAImpl_neg_u ## a_cBits(&uSrc, &fEfl1); \
1751 iemAImpl_and_u ## a_cBits(puDst, uSrc, &fEfl2); \
1752 \
1753 /* AMD: The carry flag is from the SUB operation. */ \
1754 /* 10890xe: PF always cleared? */ \
1755 fEfl2 &= ~(X86_EFL_CF | X86_EFL_PF); \
1756 fEfl2 |= fEfl1 & X86_EFL_CF; \
1757 *pfEFlags = fEfl2; \
1758}
1759
1760EMIT_BLSI(64, uint64_t, _fallback)
1761EMIT_BLSI(32, uint32_t, _fallback)
1762#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1763EMIT_BLSI(64, uint64_t, RT_NOTHING)
1764#endif
1765#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1766EMIT_BLSI(32, uint32_t, RT_NOTHING)
1767#endif
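
/*
 * Illustration only (not part of the build): BLSI isolates the lowest set bit,
 * i.e. it computes uSrc & (0 - uSrc) via the NEG+AND pairing above.  Values
 * are hypothetical.
 */
#if 0 /* example sketch, hypothetical helper */
static void iemExampleBlsi(void)
{
    uint32_t uDst, fEfl = 0;
    iemAImpl_blsi_u32_fallback(&uDst, UINT32_C(0xb0), &fEfl);
    /* uDst == 0x10: only the lowest set bit survives; CF is set since uSrc != 0. */
}
#endif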
1768
1769/*
1770 * BZHI (BMI2 instruction)
1771 */
1772#define EMIT_BZHI(a_cBits, a_Type, a_Suffix) \
1773IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_bzhi_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc1, \
1774 a_Type uSrc2, uint32_t *pfEFlags)) \
1775{ \
1776 uint32_t fEfl = *pfEFlags & ~(X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF); \
1777 a_Type uResult; \
1778 uint8_t const iFirstBit = (uint8_t)uSrc2; \
1779 if (iFirstBit < a_cBits) \
1780 uResult = uSrc1 & (((a_Type)1 << iFirstBit) - 1); \
1781 else \
1782 { \
1783 uResult = uSrc1; \
1784 fEfl |= X86_EFL_CF; \
1785 } \
1786 *puDst = uResult; \
1787 fEfl |= X86_EFL_CALC_ZF(uResult); \
1788 fEfl |= X86_EFL_CALC_SF(uResult, a_cBits); \
1789 *pfEFlags = fEfl; \
1790}
1791
1792EMIT_BZHI(64, uint64_t, _fallback)
1793EMIT_BZHI(32, uint32_t, _fallback)
1794#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1795EMIT_BZHI(64, uint64_t, RT_NOTHING)
1796#endif
1797#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1798EMIT_BZHI(32, uint32_t, RT_NOTHING)
1799#endif
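
/*
 * Illustration only (not part of the build): BZHI zeroes all bits from the
 * index given in the low byte of uSrc2 upwards.  Values are hypothetical.
 */
#if 0 /* example sketch, hypothetical helper */
static void iemExampleBzhi(void)
{
    uint32_t uDst, fEfl = 0;
    iemAImpl_bzhi_u32_fallback(&uDst, UINT32_MAX, 12 /*uSrc2: bit index*/, &fEfl);
    /* uDst == 0x00000fff: bits 12 and up are zeroed; CF is clear since the index is below 32. */
}
#endif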
1800
1801/*
1802 * POPCNT
1803 */
1804RT_ALIGNAS_VAR(64) static uint8_t const g_abBitCounts6[64] =
1805{
1806 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
1807 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
1808 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
1809 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
1810};
1811
1812/** @todo Use native popcount where possible and employ some more efficient
1813 * algorithm here (or in asm.h fallback)! */
1814
1815DECLINLINE(uint8_t) iemPopCountU16(uint16_t u16)
1816{
1817 return g_abBitCounts6[ u16 & 0x3f]
1818 + g_abBitCounts6[(u16 >> 6) & 0x3f]
1819 + g_abBitCounts6[(u16 >> 12) & 0x3f];
1820}
1821
1822DECLINLINE(uint8_t) iemPopCountU32(uint32_t u32)
1823{
1824 return g_abBitCounts6[ u32 & 0x3f]
1825 + g_abBitCounts6[(u32 >> 6) & 0x3f]
1826 + g_abBitCounts6[(u32 >> 12) & 0x3f]
1827 + g_abBitCounts6[(u32 >> 18) & 0x3f]
1828 + g_abBitCounts6[(u32 >> 24) & 0x3f]
1829 + g_abBitCounts6[(u32 >> 30) & 0x3f];
1830}
1831
1832DECLINLINE(uint8_t) iemPopCountU64(uint64_t u64)
1833{
1834 return g_abBitCounts6[ u64 & 0x3f]
1835 + g_abBitCounts6[(u64 >> 6) & 0x3f]
1836 + g_abBitCounts6[(u64 >> 12) & 0x3f]
1837 + g_abBitCounts6[(u64 >> 18) & 0x3f]
1838 + g_abBitCounts6[(u64 >> 24) & 0x3f]
1839 + g_abBitCounts6[(u64 >> 30) & 0x3f]
1840 + g_abBitCounts6[(u64 >> 36) & 0x3f]
1841 + g_abBitCounts6[(u64 >> 42) & 0x3f]
1842 + g_abBitCounts6[(u64 >> 48) & 0x3f]
1843 + g_abBitCounts6[(u64 >> 54) & 0x3f]
1844 + g_abBitCounts6[(u64 >> 60) & 0x3f];
1845}
1846
1847#define EMIT_POPCNT(a_cBits, a_Type, a_Suffix) \
1848IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_popcnt_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc, uint32_t *pfEFlags)) \
1849{ \
1850 uint32_t fEfl = *pfEFlags & ~(X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF); \
1851 a_Type uResult; \
1852 if (uSrc) \
1853 uResult = iemPopCountU ## a_cBits(uSrc); \
1854 else \
1855 { \
1856 fEfl |= X86_EFL_ZF; \
1857 uResult = 0; \
1858 } \
1859 *puDst = uResult; \
1860 *pfEFlags = fEfl; \
1861}
1862
1863EMIT_POPCNT(64, uint64_t, _fallback)
1864EMIT_POPCNT(32, uint32_t, _fallback)
1865EMIT_POPCNT(16, uint16_t, _fallback)
1866#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1867EMIT_POPCNT(64, uint64_t, RT_NOTHING)
1868#endif
1869#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1870EMIT_POPCNT(32, uint32_t, RT_NOTHING)
1871EMIT_POPCNT(16, uint16_t, RT_NOTHING)
1872#endif
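
/*
 * Illustration only (not part of the build): the @todo above mentions using a
 * native population count.  A minimal sketch of what that could look like
 * with the GCC/Clang builtin - whether and where that builtin is acceptable
 * here is an assumption, not something this file currently relies on.
 */
#if 0 /* example sketch, hypothetical helper */
DECLINLINE(uint8_t) iemPopCountU64Native(uint64_t u64)
{
    return (uint8_t)__builtin_popcountll(u64); /* counts the set bits, typically a single instruction */
}
#endif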
1873
1874
1875#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
1876
1877/*
1878 * XCHG
1879 */
1880
1881IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u64_locked,(uint64_t *puMem, uint64_t *puReg))
1882{
1883#if ARCH_BITS >= 64
1884 *puReg = ASMAtomicXchgU64(puMem, *puReg);
1885#else
1886 uint64_t uOldMem = *puMem;
1887 while (!ASMAtomicCmpXchgExU64(puMem, *puReg, uOldMem, &uOldMem))
1888 ASMNopPause();
1889 *puReg = uOldMem;
1890#endif
1891}
1892
1893# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1894
1895IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u32_locked,(uint32_t *puMem, uint32_t *puReg))
1896{
1897 *puReg = ASMAtomicXchgU32(puMem, *puReg);
1898}
1899
1900
1901IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u16_locked,(uint16_t *puMem, uint16_t *puReg))
1902{
1903 *puReg = ASMAtomicXchgU16(puMem, *puReg);
1904}
1905
1906
1907IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u8_locked,(uint8_t *puMem, uint8_t *puReg))
1908{
1909 *puReg = ASMAtomicXchgU8(puMem, *puReg);
1910}
1911
1912# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1913
1914
1915/* Unlocked variants for fDisregardLock mode: */
1916
1917IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u64_unlocked,(uint64_t *puMem, uint64_t *puReg))
1918{
1919 uint64_t const uOld = *puMem;
1920 *puMem = *puReg;
1921 *puReg = uOld;
1922}
1923
1924# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1925
1926IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u32_unlocked,(uint32_t *puMem, uint32_t *puReg))
1927{
1928 uint32_t const uOld = *puMem;
1929 *puMem = *puReg;
1930 *puReg = uOld;
1931}
1932
1933
1934IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u16_unlocked,(uint16_t *puMem, uint16_t *puReg))
1935{
1936 uint16_t const uOld = *puMem;
1937 *puMem = *puReg;
1938 *puReg = uOld;
1939}
1940
1941
1942IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u8_unlocked,(uint8_t *puMem, uint8_t *puReg))
1943{
1944 uint8_t const uOld = *puMem;
1945 *puMem = *puReg;
1946 *puReg = uOld;
1947}
1948
1949# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1950
1951
1952/*
1953 * XADD and LOCK XADD.
1954 */
1955#define EMIT_XADD(a_cBitsWidth, a_Type) \
1956IEM_DECL_IMPL_DEF(void, iemAImpl_xadd_u ## a_cBitsWidth,(a_Type *puDst, a_Type *puReg, uint32_t *pfEFlags)) \
1957{ \
1958 a_Type uDst = *puDst; \
1959 a_Type uResult = uDst; \
1960 iemAImpl_add_u ## a_cBitsWidth(&uResult, *puReg, pfEFlags); \
1961 *puDst = uResult; \
1962 *puReg = uDst; \
1963} \
1964\
1965IEM_DECL_IMPL_DEF(void, iemAImpl_xadd_u ## a_cBitsWidth ## _locked,(a_Type *puDst, a_Type *puReg, uint32_t *pfEFlags)) \
1966{ \
1967 a_Type uOld = ASMAtomicUoReadU ## a_cBitsWidth(puDst); \
1968 a_Type uResult; \
1969 uint32_t fEflTmp; \
1970 do \
1971 { \
1972 uResult = uOld; \
1973 fEflTmp = *pfEFlags; \
1974 iemAImpl_add_u ## a_cBitsWidth(&uResult, *puReg, &fEflTmp); \
1975 } while (!ASMAtomicCmpXchgExU ## a_cBitsWidth(puDst, uResult, uOld, &uOld)); \
1976 *puReg = uOld; \
1977 *pfEFlags = fEflTmp; \
1978}
1979EMIT_XADD(64, uint64_t)
1980# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1981EMIT_XADD(32, uint32_t)
1982EMIT_XADD(16, uint16_t)
1983EMIT_XADD(8, uint8_t)
1984# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1985
1986#endif
1987
1988/*
1989 * CMPXCHG, CMPXCHG8B, CMPXCHG16B
1990 *
1991 * Note! We don't have non-locking/atomic cmpxchg primitives, so all cmpxchg
1992 * instructions are emulated as locked.
1993 */
1994#if defined(IEM_WITHOUT_ASSEMBLY)
1995
1996IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u8_locked, (uint8_t *pu8Dst, uint8_t *puAl, uint8_t uSrcReg, uint32_t *pEFlags))
1997{
1998 uint8_t uOld = *puAl;
1999 if (ASMAtomicCmpXchgExU8(pu8Dst, uSrcReg, uOld, puAl))
2000 Assert(*puAl == uOld);
2001 iemAImpl_cmp_u8(&uOld, *puAl, pEFlags);
2002}
2003
2004
2005IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u16_locked,(uint16_t *pu16Dst, uint16_t *puAx, uint16_t uSrcReg, uint32_t *pEFlags))
2006{
2007 uint16_t uOld = *puAx;
2008 if (ASMAtomicCmpXchgExU16(pu16Dst, uSrcReg, uOld, puAx))
2009 Assert(*puAx == uOld);
2010 iemAImpl_cmp_u16(&uOld, *puAx, pEFlags);
2011}
2012
2013
2014IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u32_locked,(uint32_t *pu32Dst, uint32_t *puEax, uint32_t uSrcReg, uint32_t *pEFlags))
2015{
2016 uint32_t uOld = *puEax;
2017 if (ASMAtomicCmpXchgExU32(pu32Dst, uSrcReg, uOld, puEax))
2018 Assert(*puEax == uOld);
2019 iemAImpl_cmp_u32(&uOld, *puEax, pEFlags);
2020}
2021
2022
2023# if ARCH_BITS == 32
2024IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u64_locked,(uint64_t *pu64Dst, uint64_t *puRax, uint64_t *puSrcReg, uint32_t *pEFlags))
2025# else
2026IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u64_locked,(uint64_t *pu64Dst, uint64_t *puRax, uint64_t uSrcReg, uint32_t *pEFlags))
2027# endif
2028{
2029# if ARCH_BITS == 32
2030 uint64_t const uSrcReg = *puSrcReg;
2031# endif
2032 uint64_t uOld = *puRax;
2033 if (ASMAtomicCmpXchgExU64(pu64Dst, uSrcReg, uOld, puRax))
2034 Assert(*puRax == uOld);
2035 iemAImpl_cmp_u64(&uOld, *puRax, pEFlags);
2036}
2037
2038
2039IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg8b_locked,(uint64_t *pu64Dst, PRTUINT64U pu64EaxEdx, PRTUINT64U pu64EbxEcx,
2040 uint32_t *pEFlags))
2041{
2042 uint64_t const uNew = pu64EbxEcx->u;
2043 uint64_t const uOld = pu64EaxEdx->u;
2044 if (ASMAtomicCmpXchgExU64(pu64Dst, uNew, uOld, &pu64EaxEdx->u))
2045 {
2046 Assert(pu64EaxEdx->u == uOld);
2047 *pEFlags |= X86_EFL_ZF;
2048 }
2049 else
2050 *pEFlags &= ~X86_EFL_ZF;
2051}
2052
2053
2054# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_ARM64)
2055IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg16b_locked,(PRTUINT128U pu128Dst, PRTUINT128U pu128RaxRdx, PRTUINT128U pu128RbxRcx,
2056 uint32_t *pEFlags))
2057{
2058# ifdef VBOX_STRICT
2059 RTUINT128U const uOld = *pu128RaxRdx;
2060# endif
2061# if defined(RT_ARCH_AMD64)
2062 if (ASMAtomicCmpXchgU128v2(&pu128Dst->u, pu128RbxRcx->s.Hi, pu128RbxRcx->s.Lo, pu128RaxRdx->s.Hi, pu128RaxRdx->s.Lo,
2063 &pu128RaxRdx->u))
2064# else
2065 if (ASMAtomicCmpXchgU128(&pu128Dst->u, pu128RbxRcx->u, pu128RaxRdx->u, &pu128RaxRdx->u))
2066# endif
2067 {
2068 Assert(pu128RaxRdx->s.Lo == uOld.s.Lo && pu128RaxRdx->s.Hi == uOld.s.Hi);
2069 *pEFlags |= X86_EFL_ZF;
2070 }
2071 else
2072 *pEFlags &= ~X86_EFL_ZF;
2073}
2074# endif
2075
2076#endif /* defined(IEM_WITHOUT_ASSEMBLY) */
2077
2078# if !defined(RT_ARCH_ARM64) /** @todo may need this for unaligned accesses... */
2079IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg16b_fallback,(PRTUINT128U pu128Dst, PRTUINT128U pu128RaxRdx,
2080 PRTUINT128U pu128RbxRcx, uint32_t *pEFlags))
2081{
2082 RTUINT128U u128Tmp = *pu128Dst;
2083 if ( u128Tmp.s.Lo == pu128RaxRdx->s.Lo
2084 && u128Tmp.s.Hi == pu128RaxRdx->s.Hi)
2085 {
2086 *pu128Dst = *pu128RbxRcx;
2087 *pEFlags |= X86_EFL_ZF;
2088 }
2089 else
2090 {
2091 *pu128RaxRdx = u128Tmp;
2092 *pEFlags &= ~X86_EFL_ZF;
2093 }
2094}
2095#endif /* !RT_ARCH_ARM64 */
2096
2097#if defined(IEM_WITHOUT_ASSEMBLY)
2098
2099/* Unlocked versions mapped to the locked ones: */
2100
2101IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u8, (uint8_t *pu8Dst, uint8_t *puAl, uint8_t uSrcReg, uint32_t *pEFlags))
2102{
2103 iemAImpl_cmpxchg_u8_locked(pu8Dst, puAl, uSrcReg, pEFlags);
2104}
2105
2106
2107IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u16, (uint16_t *pu16Dst, uint16_t *puAx, uint16_t uSrcReg, uint32_t *pEFlags))
2108{
2109 iemAImpl_cmpxchg_u16_locked(pu16Dst, puAx, uSrcReg, pEFlags);
2110}
2111
2112
2113IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u32, (uint32_t *pu32Dst, uint32_t *puEax, uint32_t uSrcReg, uint32_t *pEFlags))
2114{
2115 iemAImpl_cmpxchg_u32_locked(pu32Dst, puEax, uSrcReg, pEFlags);
2116}
2117
2118
2119# if ARCH_BITS == 32
2120IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u64, (uint64_t *pu64Dst, uint64_t *puRax, uint64_t *puSrcReg, uint32_t *pEFlags))
2121{
2122 iemAImpl_cmpxchg_u64_locked(pu64Dst, puRax, puSrcReg, pEFlags);
2123}
2124# else
2125IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u64, (uint64_t *pu64Dst, uint64_t *puRax, uint64_t uSrcReg, uint32_t *pEFlags))
2126{
2127 iemAImpl_cmpxchg_u64_locked(pu64Dst, puRax, uSrcReg, pEFlags);
2128}
2129# endif
2130
2131
2132IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg8b,(uint64_t *pu64Dst, PRTUINT64U pu64EaxEdx, PRTUINT64U pu64EbxEcx, uint32_t *pEFlags))
2133{
2134 iemAImpl_cmpxchg8b_locked(pu64Dst, pu64EaxEdx, pu64EbxEcx, pEFlags);
2135}
2136
2137
2138IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg16b,(PRTUINT128U pu128Dst, PRTUINT128U pu128RaxRdx, PRTUINT128U pu128RbxRcx,
2139 uint32_t *pEFlags))
2140{
2141 iemAImpl_cmpxchg16b_locked(pu128Dst, pu128RaxRdx, pu128RbxRcx, pEFlags);
2142}
2143
2144#endif /* defined(IEM_WITHOUT_ASSEMBLY) */
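
/*
 * Illustration only (not part of the build): CMPXCHG semantics as implemented
 * above - the accumulator is always refreshed with the old memory value and
 * the outcome lands in ZF.  Values are hypothetical.
 */
#if 0 /* example sketch, hypothetical helper */
static void iemExampleCmpXchg(void)
{
    uint32_t uMem = 1, uEax = 1, fEfl = 0;
    iemAImpl_cmpxchg_u32_locked(&uMem, &uEax, 7 /*uSrcReg*/, &fEfl);
    /* Match: uMem == 7, uEax == 1 (the old memory value), ZF set. */

    uMem = 2;
    iemAImpl_cmpxchg_u32_locked(&uMem, &uEax, 7 /*uSrcReg*/, &fEfl);
    /* Mismatch: uMem stays 2, uEax == 2, ZF clear. */
}
#endif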
2145
2146#if (!defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)) \
2147 && !defined(DOXYGEN_RUNNING) /* Doxygen has some groking issues here and ends up mixing up input. Not worth tracking down now. */
2148
2149/*
2150 * MUL, IMUL, DIV and IDIV helpers.
2151 *
2152 * - The U64 versions must use 128-bit intermediates, so we need to abstract the
2153 * division step so we can select between using C operators and
2154 * RTUInt128DivRem/RTUInt128MulU64ByU64.
2155 *
2156 * - The U8 versions return their output in AL + AH instead of xDX + xAX, with
2157 *   IDIV/DIV taking all of their input from AX too. This means we have to abstract
2158 *   some of the input loads and the result storing (see the sketch after the macros below).
2159 */
2160
2161DECLINLINE(void) RTUInt128DivRemByU64(PRTUINT128U pQuotient, PRTUINT128U pRemainder, PCRTUINT128U pDividend, uint64_t u64Divisor)
2162{
2163# ifdef __GNUC__ /* GCC may be really annoying in this function otherwise. */
2164 pQuotient->s.Lo = 0;
2165 pQuotient->s.Hi = 0;
2166# endif
2167 RTUINT128U Divisor;
2168 Divisor.s.Lo = u64Divisor;
2169 Divisor.s.Hi = 0;
2170 RTUInt128DivRem(pQuotient, pRemainder, pDividend, &Divisor);
2171}
2172
2173# define DIV_LOAD(a_Dividend) \
2174 a_Dividend.s.Lo = *puA, a_Dividend.s.Hi = *puD
2175# define DIV_LOAD_U8(a_Dividend) \
2176 a_Dividend.u = *puAX
2177
2178# define DIV_STORE(a_Quotient, a_uRemainder)    *puA  = (a_Quotient), *puD = (a_uRemainder)
2179# define DIV_STORE_U8(a_Quotient, a_uRemainder) *puAX = (uint8_t)(a_Quotient) | ((uint16_t)(a_uRemainder) << 8)
2180
2181# define MUL_LOAD_F1() *puA
2182# define MUL_LOAD_F1_U8() ((uint8_t)*puAX)
2183
2184# define MUL_STORE(a_Result) *puA = (a_Result).s.Lo, *puD = (a_Result).s.Hi
2185# define MUL_STORE_U8(a_Result) *puAX = a_Result.u
2186
2187# define MULDIV_NEG(a_Value, a_cBitsWidth2x) \
2188 (a_Value).u = UINT ## a_cBitsWidth2x ## _C(0) - (a_Value).u
2189# define MULDIV_NEG_U128(a_Value, a_cBitsWidth2x) \
2190 RTUInt128AssignNeg(&(a_Value))
2191
2192# define MULDIV_MUL(a_Result, a_Factor1, a_Factor2, a_cBitsWidth2x) \
2193 (a_Result).u = (uint ## a_cBitsWidth2x ## _t)(a_Factor1) * (a_Factor2)
2194# define MULDIV_MUL_U128(a_Result, a_Factor1, a_Factor2, a_cBitsWidth2x) \
2195 RTUInt128MulU64ByU64(&(a_Result), a_Factor1, a_Factor2);
2196
2197# define MULDIV_MODDIV(a_Quotient, a_Remainder, a_Dividend, a_uDivisor) \
2198 a_Quotient.u = (a_Dividend).u / (a_uDivisor), \
2199 a_Remainder.u = (a_Dividend).u % (a_uDivisor)
2200# define MULDIV_MODDIV_U128(a_Quotient, a_Remainder, a_Dividend, a_uDivisor) \
2201 RTUInt128DivRemByU64(&a_Quotient, &a_Remainder, &a_Dividend, a_uDivisor)
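
/*
 * Illustration only (not part of the build): the sketch referred to above,
 * showing how the U8 loaders/storers pack the 8-bit DIV operands into AX -
 * the 16-bit dividend lives in AX and the result comes back as quotient in
 * AL and remainder in AH.  Values are hypothetical.
 */
#if 0 /* example sketch, hypothetical helper */
static void iemExampleDiv8(void)
{
    uint16_t uAX  = 100;   /* dividend in AX */
    uint32_t fEfl = 0;
    iemAImpl_div_u8(&uAX, 7 /*uDivisor*/, &fEfl);
    /* 100 / 7: AL == 14 (quotient), AH == 2 (remainder), i.e. uAX == 0x020e. */
}
#endif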
2202
2203
2204/*
2205 * MUL
2206 */
2207# define EMIT_MUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnMul, a_Suffix, a_fIntelFlags) \
2208IEM_DECL_IMPL_DEF(int, RT_CONCAT3(iemAImpl_mul_u,a_cBitsWidth,a_Suffix), a_Args) \
2209{ \
2210 RTUINT ## a_cBitsWidth2x ## U Result; \
2211 a_fnMul(Result, a_fnLoadF1(), uFactor, a_cBitsWidth2x); \
2212 a_fnStore(Result); \
2213 \
2214 /* Calc EFLAGS: */ \
2215 uint32_t fEfl = *pfEFlags; \
2216 if (a_fIntelFlags) \
2217 { /* Intel: 6700K and 10980XE behavior */ \
2218 fEfl &= ~(X86_EFL_SF | X86_EFL_CF | X86_EFL_OF | X86_EFL_AF | X86_EFL_ZF | X86_EFL_PF); \
2219 if (Result.s.Lo & RT_BIT_64(a_cBitsWidth - 1)) \
2220 fEfl |= X86_EFL_SF; \
2221 fEfl |= g_afParity[Result.s.Lo & 0xff]; \
2222 if (Result.s.Hi != 0) \
2223 fEfl |= X86_EFL_CF | X86_EFL_OF; \
2224 } \
2225 else \
2226 { /* AMD: 3990X */ \
2227 if (Result.s.Hi != 0) \
2228 fEfl |= X86_EFL_CF | X86_EFL_OF; \
2229 else \
2230 fEfl &= ~(X86_EFL_CF | X86_EFL_OF); \
2231 } \
2232 *pfEFlags = fEfl; \
2233 return 0; \
2234} \
2235
2236# define EMIT_MUL(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnMul) \
2237 EMIT_MUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnMul, RT_NOTHING, 1) \
2238 EMIT_MUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnMul, _intel, 1) \
2239 EMIT_MUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnMul, _amd, 0) \
2240
2241# ifndef DOXYGEN_RUNNING /* this totally confuses doxygen for some reason */
2242EMIT_MUL(64, 128, (uint64_t *puA, uint64_t *puD, uint64_t uFactor, uint32_t *pfEFlags), (puA, puD, uFactor, pfEFlags),
2243 MUL_LOAD_F1, MUL_STORE, MULDIV_MUL_U128)
2244# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2245EMIT_MUL(32, 64, (uint32_t *puA, uint32_t *puD, uint32_t uFactor, uint32_t *pfEFlags), (puA, puD, uFactor, pfEFlags),
2246 MUL_LOAD_F1, MUL_STORE, MULDIV_MUL)
2247EMIT_MUL(16, 32, (uint16_t *puA, uint16_t *puD, uint16_t uFactor, uint32_t *pfEFlags), (puA, puD, uFactor, pfEFlags),
2248 MUL_LOAD_F1, MUL_STORE, MULDIV_MUL)
2249EMIT_MUL(8, 16, (uint16_t *puAX, uint8_t uFactor, uint32_t *pfEFlags), (puAX, uFactor, pfEFlags),
2250 MUL_LOAD_F1_U8, MUL_STORE_U8, MULDIV_MUL)
2251# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2252# endif /* !DOXYGEN_RUNNING */
2253
2254/*
2255 * MULX
2256 */
2257# define EMIT_MULX(a_cBitsWidth, a_cBitsWidth2x, a_uType, a_fnMul, a_Suffix) \
2258IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_mulx_u,a_cBitsWidth,a_Suffix), \
2259 (a_uType *puDst1, a_uType *puDst2, a_uType uSrc1, a_uType uSrc2)) \
2260{ \
2261 RTUINT ## a_cBitsWidth2x ## U Result; \
2262 a_fnMul(Result, uSrc1, uSrc2, a_cBitsWidth2x); \
2263 *puDst2 = Result.s.Lo; /* Lower part first, as we should return the high part when puDst2 == puDst1. */ \
2264 *puDst1 = Result.s.Hi; \
2265} \
2266
2267# ifndef DOXYGEN_RUNNING /* this totally confuses doxygen for some reason */
2268EMIT_MULX(64, 128, uint64_t, MULDIV_MUL_U128, RT_NOTHING)
2269EMIT_MULX(64, 128, uint64_t, MULDIV_MUL_U128, _fallback)
2270# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2271EMIT_MULX(32, 64, uint32_t, MULDIV_MUL, RT_NOTHING)
2272EMIT_MULX(32, 64, uint32_t, MULDIV_MUL, _fallback)
2273# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2274# endif /* !DOXYGEN_RUNNING */
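
/*
 * Illustration only (not part of the build): MULX result ordering as noted in
 * the macro above - the low half is stored first so that the high half wins
 * when both destination pointers alias.  Values are hypothetical.
 */
#if 0 /* example sketch, hypothetical helper */
static void iemExampleMulx(void)
{
    uint32_t uHi, uLo;
    iemAImpl_mulx_u32_fallback(&uHi /*puDst1*/, &uLo /*puDst2*/, UINT32_C(0x10000), UINT32_C(0x10000));
    /* 0x10000 * 0x10000 == 2^32: uLo == 0, uHi == 1; no EFLAGS are touched by MULX. */
}
#endif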
2275
2276
2277/*
2278 * IMUL
2279 *
2280 * The SF, ZF, AF and PF flags are "undefined". AMD (3990X) leaves these
2281 * flags as-is, whereas Intel (Skylake 6700K and Cascade Lake 10980XE) always
2282 * clears AF and ZF and calculates SF and PF from the lower half of the result.
2283 */
2284# define EMIT_IMUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnNeg, a_fnMul, \
2285 a_Suffix, a_fIntelFlags) \
2286IEM_DECL_IMPL_DEF(int, RT_CONCAT3(iemAImpl_imul_u,a_cBitsWidth,a_Suffix),a_Args) \
2287{ \
2288 RTUINT ## a_cBitsWidth2x ## U Result; \
2289 uint32_t fEfl = *pfEFlags & ~(X86_EFL_CF | X86_EFL_OF); \
2290 \
2291 uint ## a_cBitsWidth ## _t const uFactor1 = a_fnLoadF1(); \
2292 if (!(uFactor1 & RT_BIT_64(a_cBitsWidth - 1))) \
2293 { \
2294 if (!(uFactor2 & RT_BIT_64(a_cBitsWidth - 1))) \
2295 { \
2296 a_fnMul(Result, uFactor1, uFactor2, a_cBitsWidth2x); \
2297 if (Result.s.Hi != 0 || Result.s.Lo >= RT_BIT_64(a_cBitsWidth - 1)) \
2298 fEfl |= X86_EFL_CF | X86_EFL_OF; \
2299 } \
2300 else \
2301 { \
2302 uint ## a_cBitsWidth ## _t const uPositiveFactor2 = UINT ## a_cBitsWidth ## _C(0) - uFactor2; \
2303 a_fnMul(Result, uFactor1, uPositiveFactor2, a_cBitsWidth2x); \
2304 if (Result.s.Hi != 0 || Result.s.Lo > RT_BIT_64(a_cBitsWidth - 1)) \
2305 fEfl |= X86_EFL_CF | X86_EFL_OF; \
2306 a_fnNeg(Result, a_cBitsWidth2x); \
2307 } \
2308 } \
2309 else \
2310 { \
2311 if (!(uFactor2 & RT_BIT_64(a_cBitsWidth - 1))) \
2312 { \
2313 uint ## a_cBitsWidth ## _t const uPositiveFactor1 = UINT ## a_cBitsWidth ## _C(0) - uFactor1; \
2314 a_fnMul(Result, uPositiveFactor1, uFactor2, a_cBitsWidth2x); \
2315 if (Result.s.Hi != 0 || Result.s.Lo > RT_BIT_64(a_cBitsWidth - 1)) \
2316 fEfl |= X86_EFL_CF | X86_EFL_OF; \
2317 a_fnNeg(Result, a_cBitsWidth2x); \
2318 } \
2319 else \
2320 { \
2321 uint ## a_cBitsWidth ## _t const uPositiveFactor1 = UINT ## a_cBitsWidth ## _C(0) - uFactor1; \
2322 uint ## a_cBitsWidth ## _t const uPositiveFactor2 = UINT ## a_cBitsWidth ## _C(0) - uFactor2; \
2323 a_fnMul(Result, uPositiveFactor1, uPositiveFactor2, a_cBitsWidth2x); \
2324 if (Result.s.Hi != 0 || Result.s.Lo >= RT_BIT_64(a_cBitsWidth - 1)) \
2325 fEfl |= X86_EFL_CF | X86_EFL_OF; \
2326 } \
2327 } \
2328 a_fnStore(Result); \
2329 \
2330 if (a_fIntelFlags) \
2331 { \
2332 fEfl &= ~(X86_EFL_AF | X86_EFL_ZF | X86_EFL_SF | X86_EFL_PF); \
2333 if (Result.s.Lo & RT_BIT_64(a_cBitsWidth - 1)) \
2334 fEfl |= X86_EFL_SF; \
2335 fEfl |= g_afParity[Result.s.Lo & 0xff]; \
2336 } \
2337 *pfEFlags = fEfl; \
2338 return 0; \
2339}
2340# define EMIT_IMUL(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnNeg, a_fnMul) \
2341 EMIT_IMUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnNeg, a_fnMul, RT_NOTHING, 1) \
2342 EMIT_IMUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnNeg, a_fnMul, _intel, 1) \
2343 EMIT_IMUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnNeg, a_fnMul, _amd, 0)
2344
2345# ifndef DOXYGEN_RUNNING /* this totally confuses doxygen for some reason */
2346EMIT_IMUL(64, 128, (uint64_t *puA, uint64_t *puD, uint64_t uFactor2, uint32_t *pfEFlags), (puA, puD, uFactor2, pfEFlags),
2347 MUL_LOAD_F1, MUL_STORE, MULDIV_NEG_U128, MULDIV_MUL_U128)
2348# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2349EMIT_IMUL(32, 64, (uint32_t *puA, uint32_t *puD, uint32_t uFactor2, uint32_t *pfEFlags), (puA, puD, uFactor2, pfEFlags),
2350 MUL_LOAD_F1, MUL_STORE, MULDIV_NEG, MULDIV_MUL)
2351EMIT_IMUL(16, 32, (uint16_t *puA, uint16_t *puD, uint16_t uFactor2, uint32_t *pfEFlags), (puA, puD, uFactor2, pfEFlags),
2352 MUL_LOAD_F1, MUL_STORE, MULDIV_NEG, MULDIV_MUL)
2353EMIT_IMUL(8, 16, (uint16_t *puAX, uint8_t uFactor2, uint32_t *pfEFlags), (puAX, uFactor2, pfEFlags),
2354 MUL_LOAD_F1_U8, MUL_STORE_U8, MULDIV_NEG, MULDIV_MUL)
2355# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2356# endif /* !DOXYGEN_RUNNING */
2357
2358
2359/*
2360 * IMUL with two operands is mapped onto the three-operand variant, ignoring
2361 * the high part of the product.
2362 */
2363# define EMIT_IMUL_TWO(a_cBits, a_uType) \
2364IEM_DECL_IMPL_DEF(void, iemAImpl_imul_two_u ## a_cBits,(a_uType *puDst, a_uType uSrc, uint32_t *pfEFlags)) \
2365{ \
2366 a_uType uIgn; \
2367 iemAImpl_imul_u ## a_cBits(puDst, &uIgn, uSrc, pfEFlags); \
2368} \
2369\
2370IEM_DECL_IMPL_DEF(void, iemAImpl_imul_two_u ## a_cBits ## _intel,(a_uType *puDst, a_uType uSrc, uint32_t *pfEFlags)) \
2371{ \
2372 a_uType uIgn; \
2373 iemAImpl_imul_u ## a_cBits ## _intel(puDst, &uIgn, uSrc, pfEFlags); \
2374} \
2375\
2376IEM_DECL_IMPL_DEF(void, iemAImpl_imul_two_u ## a_cBits ## _amd,(a_uType *puDst, a_uType uSrc, uint32_t *pfEFlags)) \
2377{ \
2378 a_uType uIgn; \
2379 iemAImpl_imul_u ## a_cBits ## _amd(puDst, &uIgn, uSrc, pfEFlags); \
2380}
2381
2382EMIT_IMUL_TWO(64, uint64_t)
2383# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2384EMIT_IMUL_TWO(32, uint32_t)
2385EMIT_IMUL_TWO(16, uint16_t)
2386# endif
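
/*
 * Illustration only (not part of the build): the two-operand form keeps only
 * the low half of the product but still reports overflow via CF/OF, as the
 * mapping above implies.  Values are hypothetical.
 */
#if 0 /* example sketch, hypothetical helper */
static void iemExampleImulTwo(void)
{
    uint32_t uDst = UINT32_C(0x10000);
    uint32_t fEfl = 0;
    iemAImpl_imul_two_u32(&uDst, UINT32_C(0x10000), &fEfl);
    /* The full product is 2^32, so uDst == 0 while CF and OF are set. */
}
#endif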
2387
2388
2389/*
2390 * DIV
2391 */
2392# define EMIT_DIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnDivRem, \
2393 a_Suffix, a_fIntelFlags) \
2394IEM_DECL_IMPL_DEF(int, RT_CONCAT3(iemAImpl_div_u,a_cBitsWidth,a_Suffix),a_Args) \
2395{ \
2396 RTUINT ## a_cBitsWidth2x ## U Dividend; \
2397 a_fnLoad(Dividend); \
2398 if ( uDivisor != 0 \
2399 && Dividend.s.Hi < uDivisor) \
2400 { \
2401 RTUINT ## a_cBitsWidth2x ## U Remainder, Quotient; \
2402 a_fnDivRem(Quotient, Remainder, Dividend, uDivisor); \
2403 a_fnStore(Quotient.s.Lo, Remainder.s.Lo); \
2404 \
2405        /* Calc EFLAGS: Intel 6700K and 10980XE leave them alone. AMD 3990X sets AF and clears PF, ZF and SF. */ \
2406 if (!a_fIntelFlags) \
2407 *pfEFlags = (*pfEFlags & ~(X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF)) | X86_EFL_AF; \
2408 return 0; \
2409 } \
2410 /* #DE */ \
2411 return -1; \
2412}
2413# define EMIT_DIV(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnDivRem) \
2414 EMIT_DIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnDivRem, RT_NOTHING, 1) \
2415 EMIT_DIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnDivRem, _intel, 1) \
2416 EMIT_DIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnDivRem, _amd, 0)
2417
2418# ifndef DOXYGEN_RUNNING /* this totally confuses doxygen for some reason */
2419EMIT_DIV(64,128,(uint64_t *puA, uint64_t *puD, uint64_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
2420 DIV_LOAD, DIV_STORE, MULDIV_MODDIV_U128)
2421# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2422EMIT_DIV(32,64, (uint32_t *puA, uint32_t *puD, uint32_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
2423 DIV_LOAD, DIV_STORE, MULDIV_MODDIV)
2424EMIT_DIV(16,32, (uint16_t *puA, uint16_t *puD, uint16_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
2425 DIV_LOAD, DIV_STORE, MULDIV_MODDIV)
2426EMIT_DIV(8,16, (uint16_t *puAX, uint8_t uDivisor, uint32_t *pfEFlags), (puAX, uDivisor, pfEFlags),
2427 DIV_LOAD_U8, DIV_STORE_U8, MULDIV_MODDIV)
2428# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2429# endif /* !DOXYGEN_RUNNING */
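
/*
 * Illustration only (not part of the build): the helpers above signal #DE
 * conditions through the return value; nothing is raised here.  Values are
 * hypothetical.
 */
#if 0 /* example sketch, hypothetical helper */
static void iemExampleDivDe(void)
{
    uint32_t uEax = 1, uEdx = 1, fEfl = 0;
    int rc = iemAImpl_div_u32(&uEax, &uEdx, 1 /*uDivisor*/, &fEfl);
    /* EDX:EAX == 0x1'00000001 and EDX >= the divisor, so the quotient cannot fit: rc == -1 (#DE). */
    RT_NOREF(rc);
}
#endif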
2430
2431
2432/*
2433 * IDIV
2434 *
2435 * EFLAGS are ignored and left as-is by Intel 6700K and 10980XE. AMD 3990X will
2436 * set AF and clear PF, ZF and SF just like it does for DIV.
2437 *
2438 */
2439# define EMIT_IDIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnNeg, a_fnDivRem, \
2440 a_Suffix, a_fIntelFlags) \
2441IEM_DECL_IMPL_DEF(int, RT_CONCAT3(iemAImpl_idiv_u,a_cBitsWidth,a_Suffix),a_Args) \
2442{ \
2443 /* Note! Skylake leaves all flags alone. */ \
2444 \
2445 /** @todo overflow checks */ \
2446 if (uDivisor != 0) \
2447 { \
2448 /* \
2449 * Convert to unsigned division. \
2450 */ \
2451 RTUINT ## a_cBitsWidth2x ## U Dividend; \
2452 a_fnLoad(Dividend); \
2453 bool const fSignedDividend = RT_BOOL(Dividend.s.Hi & RT_BIT_64(a_cBitsWidth - 1)); \
2454 if (fSignedDividend) \
2455 a_fnNeg(Dividend, a_cBitsWidth2x); \
2456 \
2457 uint ## a_cBitsWidth ## _t uDivisorPositive; \
2458 if (!(uDivisor & RT_BIT_64(a_cBitsWidth - 1))) \
2459 uDivisorPositive = uDivisor; \
2460 else \
2461 uDivisorPositive = UINT ## a_cBitsWidth ## _C(0) - uDivisor; \
2462 \
2463 RTUINT ## a_cBitsWidth2x ## U Remainder, Quotient; \
2464 a_fnDivRem(Quotient, Remainder, Dividend, uDivisorPositive); \
2465 \
2466 /* \
2467 * Setup the result, checking for overflows. \
2468 */ \
2469 if (!(uDivisor & RT_BIT_64(a_cBitsWidth - 1))) \
2470 { \
2471 if (!fSignedDividend) \
2472 { \
2473 /* Positive divisor, positive dividend => result positive. */ \
2474 if (Quotient.s.Hi == 0 && Quotient.s.Lo <= (uint ## a_cBitsWidth ## _t)INT ## a_cBitsWidth ## _MAX) \
2475 { \
2476 a_fnStore(Quotient.s.Lo, Remainder.s.Lo); \
2477 if (!a_fIntelFlags) \
2478 *pfEFlags = (*pfEFlags & ~(X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF)) | X86_EFL_AF; \
2479 return 0; \
2480 } \
2481 } \
2482 else \
2483 { \
2484 /* Positive divisor, negative dividend => result negative. */ \
2485 if (Quotient.s.Hi == 0 && Quotient.s.Lo <= RT_BIT_64(a_cBitsWidth - 1)) \
2486 { \
2487 a_fnStore(UINT ## a_cBitsWidth ## _C(0) - Quotient.s.Lo, UINT ## a_cBitsWidth ## _C(0) - Remainder.s.Lo); \
2488 if (!a_fIntelFlags) \
2489 *pfEFlags = (*pfEFlags & ~(X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF)) | X86_EFL_AF; \
2490 return 0; \
2491 } \
2492 } \
2493 } \
2494 else \
2495 { \
2496 if (!fSignedDividend) \
2497 { \
2498 /* Negative divisor, positive dividend => negative quotient, positive remainder. */ \
2499 if (Quotient.s.Hi == 0 && Quotient.s.Lo <= RT_BIT_64(a_cBitsWidth - 1)) \
2500 { \
2501 a_fnStore(UINT ## a_cBitsWidth ## _C(0) - Quotient.s.Lo, Remainder.s.Lo); \
2502 if (!a_fIntelFlags) \
2503 *pfEFlags = (*pfEFlags & ~(X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF)) | X86_EFL_AF; \
2504 return 0; \
2505 } \
2506 } \
2507 else \
2508 { \
2509 /* Negative divisor, negative dividend => positive quotient, negative remainder. */ \
2510 if (Quotient.s.Hi == 0 && Quotient.s.Lo <= (uint ## a_cBitsWidth ## _t)INT ## a_cBitsWidth ## _MAX) \
2511 { \
2512 a_fnStore(Quotient.s.Lo, UINT ## a_cBitsWidth ## _C(0) - Remainder.s.Lo); \
2513 if (!a_fIntelFlags) \
2514 *pfEFlags = (*pfEFlags & ~(X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF)) | X86_EFL_AF; \
2515 return 0; \
2516 } \
2517 } \
2518 } \
2519 } \
2520 /* #DE */ \
2521 return -1; \
2522}
2523# define EMIT_IDIV(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnNeg, a_fnDivRem) \
2524 EMIT_IDIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnNeg, a_fnDivRem, RT_NOTHING, 1) \
2525 EMIT_IDIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnNeg, a_fnDivRem, _intel, 1) \
2526 EMIT_IDIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnNeg, a_fnDivRem, _amd, 0)
2527
2528# ifndef DOXYGEN_RUNNING /* this totally confuses doxygen for some reason */
2529EMIT_IDIV(64,128,(uint64_t *puA, uint64_t *puD, uint64_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
2530 DIV_LOAD, DIV_STORE, MULDIV_NEG_U128, MULDIV_MODDIV_U128)
2531# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2532EMIT_IDIV(32,64,(uint32_t *puA, uint32_t *puD, uint32_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
2533 DIV_LOAD, DIV_STORE, MULDIV_NEG, MULDIV_MODDIV)
2534EMIT_IDIV(16,32,(uint16_t *puA, uint16_t *puD, uint16_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
2535 DIV_LOAD, DIV_STORE, MULDIV_NEG, MULDIV_MODDIV)
2536EMIT_IDIV(8,16,(uint16_t *puAX, uint8_t uDivisor, uint32_t *pfEFlags), (puAX, uDivisor, pfEFlags),
2537 DIV_LOAD_U8, DIV_STORE_U8, MULDIV_NEG, MULDIV_MODDIV)
2538# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2539# endif /* !DOXYGEN_RUNNING */
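
/*
 * Illustration only (not part of the build): the classic IDIV overflow case
 * rejected by the checks above - the quotient of INT32_MIN / -1 cannot be
 * represented in 32 bits.  Values are hypothetical.
 */
#if 0 /* example sketch, hypothetical helper */
static void iemExampleIdivDe(void)
{
    uint32_t uEax = UINT32_C(0x80000000);  /* EDX:EAX == -2147483648 as a signed 64-bit value */
    uint32_t uEdx = UINT32_C(0xffffffff);
    uint32_t fEfl = 0;
    int rc = iemAImpl_idiv_u32(&uEax, &uEdx, UINT32_C(0xffffffff) /* -1 */, &fEfl);
    /* The quotient would be +2147483648 > INT32_MAX, so rc == -1 (#DE) and EAX/EDX are untouched. */
    RT_NOREF(rc);
}
#endif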
2540
2541#endif /* (!defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)) && !defined(DOXYGEN_RUNNING) */
2542
2543
2544/*********************************************************************************************************************************
2545* Unary operations. *
2546*********************************************************************************************************************************/
2547#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
2548
2549/** @def IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC
2550 * Updates the status bits (PF, AF, ZF, SF, and OF) for an INC or DEC instruction.
2551 *
2552 * CF is NOT modified for hysterical raisins (allegedly for carrying and
2553 * borrowing in arithmetic loops on intel 8008).
2554 *
2555 * @returns Status bits.
2556 * @param a_pfEFlags Pointer to the 32-bit EFLAGS value to update.
2557 * @param a_uResult Unsigned result value.
2558 * @param a_uDst The original destination value (for AF calc).
2559 * @param a_cBitsWidth The width of the result (8, 16, 32, 64).
2560 * @param a_OfMethod 0 for INC-style, 1 for DEC-style.
2561 */
2562#define IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(a_pfEFlags, a_uResult, a_uDst, a_cBitsWidth, a_OfMethod) \
2563 do { \
2564 uint32_t fEflTmp = *(a_pfEFlags); \
2565 fEflTmp &= ~X86_EFL_STATUS_BITS | X86_EFL_CF; \
2566 fEflTmp |= g_afParity[(a_uResult) & 0xff]; \
2567 fEflTmp |= ((uint32_t)(a_uResult) ^ (uint32_t)(a_uDst)) & X86_EFL_AF; \
2568 fEflTmp |= X86_EFL_CALC_ZF(a_uResult); \
2569 fEflTmp |= X86_EFL_CALC_SF(a_uResult, a_cBitsWidth); \
2570 fEflTmp |= X86_EFL_GET_OF_ ## a_cBitsWidth(a_OfMethod == 0 ? (((a_uDst) ^ RT_BIT_64(a_cBitsWidth - 1)) & (a_uResult)) \
2571 : ((a_uDst) & ((a_uResult) ^ RT_BIT_64(a_cBitsWidth - 1))) ); \
2572 *(a_pfEFlags) = fEflTmp; \
2573 } while (0)
2574
2575/*
2576 * INC
2577 */
2578
2579IEM_DECL_IMPL_DEF(void, iemAImpl_inc_u64,(uint64_t *puDst, uint32_t *pfEFlags))
2580{
2581 uint64_t uDst = *puDst;
2582 uint64_t uResult = uDst + 1;
2583 *puDst = uResult;
2584 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 64, 0 /*INC*/);
2585}
2586
2587# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2588
2589IEM_DECL_IMPL_DEF(void, iemAImpl_inc_u32,(uint32_t *puDst, uint32_t *pfEFlags))
2590{
2591 uint32_t uDst = *puDst;
2592 uint32_t uResult = uDst + 1;
2593 *puDst = uResult;
2594 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 32, 0 /*INC*/);
2595}
2596
2597
2598IEM_DECL_IMPL_DEF(void, iemAImpl_inc_u16,(uint16_t *puDst, uint32_t *pfEFlags))
2599{
2600 uint16_t uDst = *puDst;
2601 uint16_t uResult = uDst + 1;
2602 *puDst = uResult;
2603 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 16, 0 /*INC*/);
2604}
2605
2606IEM_DECL_IMPL_DEF(void, iemAImpl_inc_u8,(uint8_t *puDst, uint32_t *pfEFlags))
2607{
2608 uint8_t uDst = *puDst;
2609 uint8_t uResult = uDst + 1;
2610 *puDst = uResult;
2611 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 8, 0 /*INC*/);
2612}
2613
2614# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
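
/*
 * Illustration only (not part of the build): unlike ADD, INC/DEC leave CF
 * untouched, as the status-bit macro above encodes.  Values are hypothetical.
 */
#if 0 /* example sketch, hypothetical helper */
static void iemExampleIncCf(void)
{
    uint32_t uDst = UINT32_MAX;
    uint32_t fEfl = X86_EFL_CF;     /* pretend a previous instruction set CF */
    iemAImpl_inc_u32(&uDst, &fEfl);
    /* uDst wraps to 0 and ZF is set, but CF remains set - it is simply not updated. */
}
#endif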
2615
2616
2617/*
2618 * DEC
2619 */
2620
2621IEM_DECL_IMPL_DEF(void, iemAImpl_dec_u64,(uint64_t *puDst, uint32_t *pfEFlags))
2622{
2623 uint64_t uDst = *puDst;
2624 uint64_t uResult = uDst - 1;
2625 *puDst = uResult;
2626    IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 64, 1 /*DEC*/);
2627}
2628
2629# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2630
2631IEM_DECL_IMPL_DEF(void, iemAImpl_dec_u32,(uint32_t *puDst, uint32_t *pfEFlags))
2632{
2633 uint32_t uDst = *puDst;
2634 uint32_t uResult = uDst - 1;
2635 *puDst = uResult;
2636    IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 32, 1 /*DEC*/);
2637}
2638
2639
2640IEM_DECL_IMPL_DEF(void, iemAImpl_dec_u16,(uint16_t *puDst, uint32_t *pfEFlags))
2641{
2642 uint16_t uDst = *puDst;
2643 uint16_t uResult = uDst - 1;
2644 *puDst = uResult;
2645    IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 16, 1 /*DEC*/);
2646}
2647
2648
2649IEM_DECL_IMPL_DEF(void, iemAImpl_dec_u8,(uint8_t *puDst, uint32_t *pfEFlags))
2650{
2651 uint8_t uDst = *puDst;
2652 uint8_t uResult = uDst - 1;
2653 *puDst = uResult;
2654    IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 8, 1 /*DEC*/);
2655}
2656
2657# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2658
2659
2660/*
2661 * NOT
2662 */
2663
2664IEM_DECL_IMPL_DEF(void, iemAImpl_not_u64,(uint64_t *puDst, uint32_t *pfEFlags))
2665{
2666 uint64_t uDst = *puDst;
2667 uint64_t uResult = ~uDst;
2668 *puDst = uResult;
2669 /* EFLAGS are not modified. */
2670 RT_NOREF_PV(pfEFlags);
2671}
2672
2673# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2674
2675IEM_DECL_IMPL_DEF(void, iemAImpl_not_u32,(uint32_t *puDst, uint32_t *pfEFlags))
2676{
2677 uint32_t uDst = *puDst;
2678 uint32_t uResult = ~uDst;
2679 *puDst = uResult;
2680 /* EFLAGS are not modified. */
2681 RT_NOREF_PV(pfEFlags);
2682}
2683
2684IEM_DECL_IMPL_DEF(void, iemAImpl_not_u16,(uint16_t *puDst, uint32_t *pfEFlags))
2685{
2686 uint16_t uDst = *puDst;
2687 uint16_t uResult = ~uDst;
2688 *puDst = uResult;
2689 /* EFLAGS are not modified. */
2690 RT_NOREF_PV(pfEFlags);
2691}
2692
2693IEM_DECL_IMPL_DEF(void, iemAImpl_not_u8,(uint8_t *puDst, uint32_t *pfEFlags))
2694{
2695 uint8_t uDst = *puDst;
2696 uint8_t uResult = ~uDst;
2697 *puDst = uResult;
2698 /* EFLAGS are not modified. */
2699 RT_NOREF_PV(pfEFlags);
2700}
2701
2702# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2703
2704
2705/*
2706 * NEG
2707 */
2708
2709/**
2710 * Updates the status bits (CF, PF, AF, ZF, SF, and OF) for a NEG instruction.
2711 *
2712 * @returns Status bits.
2713 * @param a_pfEFlags Pointer to the 32-bit EFLAGS value to update.
2714 * @param a_uResult Unsigned result value.
2715 * @param a_uDst The original destination value (for AF calc).
2716 * @param a_cBitsWidth The width of the result (8, 16, 32, 64).
2717 */
2718#define IEM_EFL_UPDATE_STATUS_BITS_FOR_NEG(a_pfEFlags, a_uResult, a_uDst, a_cBitsWidth) \
2719 do { \
2720 uint32_t fEflTmp = *(a_pfEFlags); \
2721 fEflTmp &= ~X86_EFL_STATUS_BITS & ~X86_EFL_CF; \
2722 fEflTmp |= ((a_uDst) != 0) << X86_EFL_CF_BIT; \
2723 fEflTmp |= g_afParity[(a_uResult) & 0xff]; \
2724 fEflTmp |= ((uint32_t)(a_uResult) ^ (uint32_t)(a_uDst)) & X86_EFL_AF; \
2725 fEflTmp |= X86_EFL_CALC_ZF(a_uResult); \
2726 fEflTmp |= X86_EFL_CALC_SF(a_uResult, a_cBitsWidth); \
2727 fEflTmp |= X86_EFL_GET_OF_ ## a_cBitsWidth((a_uDst) & (a_uResult)); \
2728 *(a_pfEFlags) = fEflTmp; \
2729 } while (0)
2730
2731IEM_DECL_IMPL_DEF(void, iemAImpl_neg_u64,(uint64_t *puDst, uint32_t *pfEFlags))
2732{
2733 uint64_t uDst = *puDst;
2734 uint64_t uResult = (uint64_t)0 - uDst;
2735 *puDst = uResult;
2736 IEM_EFL_UPDATE_STATUS_BITS_FOR_NEG(pfEFlags, uResult, uDst, 64);
2737}
2738
2739# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2740
2741IEM_DECL_IMPL_DEF(void, iemAImpl_neg_u32,(uint32_t *puDst, uint32_t *pfEFlags))
2742{
2743 uint32_t uDst = *puDst;
2744 uint32_t uResult = (uint32_t)0 - uDst;
2745 *puDst = uResult;
2746 IEM_EFL_UPDATE_STATUS_BITS_FOR_NEG(pfEFlags, uResult, uDst, 32);
2747}
2748
2749
2750IEM_DECL_IMPL_DEF(void, iemAImpl_neg_u16,(uint16_t *puDst, uint32_t *pfEFlags))
2751{
2752 uint16_t uDst = *puDst;
2753 uint16_t uResult = (uint16_t)0 - uDst;
2754 *puDst = uResult;
2755 IEM_EFL_UPDATE_STATUS_BITS_FOR_NEG(pfEFlags, uResult, uDst, 16);
2756}
2757
2758
2759IEM_DECL_IMPL_DEF(void, iemAImpl_neg_u8,(uint8_t *puDst, uint32_t *pfEFlags))
2760{
2761 uint8_t uDst = *puDst;
2762 uint8_t uResult = (uint8_t)0 - uDst;
2763 *puDst = uResult;
2764 IEM_EFL_UPDATE_STATUS_BITS_FOR_NEG(pfEFlags, uResult, uDst, 8);
2765}
2766
2767# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
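
/*
 * Illustration only (not part of the build): NEG sets CF for any non-zero
 * operand, as encoded by the status-bit macro above.  Values are hypothetical.
 */
#if 0 /* example sketch, hypothetical helper */
static void iemExampleNegCf(void)
{
    uint8_t  uDst = 1;
    uint32_t fEfl = 0;
    iemAImpl_neg_u8(&uDst, &fEfl);
    /* uDst == 0xff (two's complement of 1); CF and SF are set, ZF is clear. */
}
#endif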
2768
2769/*
2770 * Locked variants.
2771 */
2772
2773/** Emit a function for doing a locked unary operand operation. */
2774# define EMIT_LOCKED_UNARY_OP(a_Mnemonic, a_cBitsWidth) \
2775 IEM_DECL_IMPL_DEF(void, iemAImpl_ ## a_Mnemonic ## _u ## a_cBitsWidth ## _locked,(uint ## a_cBitsWidth ## _t *puDst, \
2776 uint32_t *pfEFlags)) \
2777 { \
2778 uint ## a_cBitsWidth ## _t uOld = ASMAtomicUoReadU ## a_cBitsWidth(puDst); \
2779 uint ## a_cBitsWidth ## _t uTmp; \
2780 uint32_t fEflTmp; \
2781 do \
2782 { \
2783 uTmp = uOld; \
2784 fEflTmp = *pfEFlags; \
2785 iemAImpl_ ## a_Mnemonic ## _u ## a_cBitsWidth(&uTmp, &fEflTmp); \
2786 } while (!ASMAtomicCmpXchgExU ## a_cBitsWidth(puDst, uTmp, uOld, &uOld)); \
2787 *pfEFlags = fEflTmp; \
2788 }
2789
2790EMIT_LOCKED_UNARY_OP(inc, 64)
2791EMIT_LOCKED_UNARY_OP(dec, 64)
2792EMIT_LOCKED_UNARY_OP(not, 64)
2793EMIT_LOCKED_UNARY_OP(neg, 64)
2794# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2795EMIT_LOCKED_UNARY_OP(inc, 32)
2796EMIT_LOCKED_UNARY_OP(dec, 32)
2797EMIT_LOCKED_UNARY_OP(not, 32)
2798EMIT_LOCKED_UNARY_OP(neg, 32)
2799
2800EMIT_LOCKED_UNARY_OP(inc, 16)
2801EMIT_LOCKED_UNARY_OP(dec, 16)
2802EMIT_LOCKED_UNARY_OP(not, 16)
2803EMIT_LOCKED_UNARY_OP(neg, 16)
2804
2805EMIT_LOCKED_UNARY_OP(inc, 8)
2806EMIT_LOCKED_UNARY_OP(dec, 8)
2807EMIT_LOCKED_UNARY_OP(not, 8)
2808EMIT_LOCKED_UNARY_OP(neg, 8)
2809# endif
2810
2811#endif /* !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY) */
2812
2813
2814/*********************************************************************************************************************************
2815* Shifting and Rotating *
2816*********************************************************************************************************************************/
2817
2818/*
2819 * ROL
2820 */
2821#define EMIT_ROL(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags, a_fnHlp) \
2822IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_rol_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
2823{ \
2824 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
2825 if (cShift) \
2826 { \
2827 if (a_cBitsWidth < 32) \
2828 cShift &= a_cBitsWidth - 1; \
2829 a_uType const uDst = *puDst; \
2830 a_uType const uResult = a_fnHlp(uDst, cShift); \
2831 *puDst = uResult; \
2832 \
2833        /* Calc EFLAGS. The OF bit is undefined if cShift > 1; we implement \
2834           it the same way as for 1-bit shifts. */ \
2835 AssertCompile(X86_EFL_CF_BIT == 0); \
2836 uint32_t fEfl = *pfEFlags; \
2837 fEfl &= ~(X86_EFL_CF | X86_EFL_OF); \
2838 uint32_t const fCarry = (uResult & X86_EFL_CF); \
2839 fEfl |= fCarry; \
2840 if (!a_fIntelFlags) /* AMD 3990X: According to the last sub-shift: */ \
2841 fEfl |= ((uResult >> (a_cBitsWidth - 1)) ^ fCarry) << X86_EFL_OF_BIT; \
2842 else /* Intel 10980XE: According to the first sub-shift: */ \
2843 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uDst << 1)); \
2844 *pfEFlags = fEfl; \
2845 } \
2846}
2847
2848#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
2849EMIT_ROL(64, uint64_t, RT_NOTHING, 1, ASMRotateLeftU64)
2850#endif
2851EMIT_ROL(64, uint64_t, _intel, 1, ASMRotateLeftU64)
2852EMIT_ROL(64, uint64_t, _amd, 0, ASMRotateLeftU64)
2853
2854#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
2855EMIT_ROL(32, uint32_t, RT_NOTHING, 1, ASMRotateLeftU32)
2856#endif
2857EMIT_ROL(32, uint32_t, _intel, 1, ASMRotateLeftU32)
2858EMIT_ROL(32, uint32_t, _amd, 0, ASMRotateLeftU32)
2859
2860DECL_FORCE_INLINE(uint16_t) iemAImpl_rol_u16_hlp(uint16_t uValue, uint8_t cShift)
2861{
2862 return (uValue << cShift) | (uValue >> (16 - cShift));
2863}
2864#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
2865EMIT_ROL(16, uint16_t, RT_NOTHING, 1, iemAImpl_rol_u16_hlp)
2866#endif
2867EMIT_ROL(16, uint16_t, _intel, 1, iemAImpl_rol_u16_hlp)
2868EMIT_ROL(16, uint16_t, _amd, 0, iemAImpl_rol_u16_hlp)
2869
2870DECL_FORCE_INLINE(uint8_t) iemAImpl_rol_u8_hlp(uint8_t uValue, uint8_t cShift)
2871{
2872 return (uValue << cShift) | (uValue >> (8 - cShift));
2873}
2874#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
2875EMIT_ROL(8, uint8_t, RT_NOTHING, 1, iemAImpl_rol_u8_hlp)
2876#endif
2877EMIT_ROL(8, uint8_t, _intel, 1, iemAImpl_rol_u8_hlp)
2878EMIT_ROL(8, uint8_t, _amd, 0, iemAImpl_rol_u8_hlp)
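
/*
 * Illustration only (not part of the build): after ROL the carry flag mirrors
 * the bit that was rotated into position 0, as computed above.  Values are
 * hypothetical.
 */
#if 0 /* example sketch, hypothetical helper */
static void iemExampleRol(void)
{
    uint8_t  uDst = 0x81;
    uint32_t fEfl = 0;
    iemAImpl_rol_u8_intel(&uDst, 1, &fEfl);
    /* 0x81 rotated left by one is 0x03; CF is set because bit 7 wrapped around into bit 0. */
}
#endif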
2879
2880
2881/*
2882 * ROR
2883 */
2884#define EMIT_ROR(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags, a_fnHlp) \
2885IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_ror_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
2886{ \
2887 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
2888 if (cShift) \
2889 { \
2890 if (a_cBitsWidth < 32) \
2891 cShift &= a_cBitsWidth - 1; \
2892 a_uType const uDst = *puDst; \
2893 a_uType const uResult = a_fnHlp(uDst, cShift); \
2894 *puDst = uResult; \
2895 \
2896 /* Calc EFLAGS: */ \
2897 AssertCompile(X86_EFL_CF_BIT == 0); \
2898 uint32_t fEfl = *pfEFlags; \
2899 fEfl &= ~(X86_EFL_CF | X86_EFL_OF); \
2900 uint32_t const fCarry = (uResult >> ((a_cBitsWidth) - 1)) & X86_EFL_CF; \
2901 fEfl |= fCarry; \
2902 if (!a_fIntelFlags) /* AMD 3990X: According to the last sub-shift: */ \
2903 fEfl |= (((uResult >> ((a_cBitsWidth) - 2)) ^ fCarry) & 1) << X86_EFL_OF_BIT; \
2904 else /* Intel 10980XE: According to the first sub-shift: */ \
2905 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uDst << (a_cBitsWidth - 1))); \
2906 *pfEFlags = fEfl; \
2907 } \
2908}
2909
2910#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
2911EMIT_ROR(64, uint64_t, RT_NOTHING, 1, ASMRotateRightU64)
2912#endif
2913EMIT_ROR(64, uint64_t, _intel, 1, ASMRotateRightU64)
2914EMIT_ROR(64, uint64_t, _amd, 0, ASMRotateRightU64)
2915
2916#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
2917EMIT_ROR(32, uint32_t, RT_NOTHING, 1, ASMRotateRightU32)
2918#endif
2919EMIT_ROR(32, uint32_t, _intel, 1, ASMRotateRightU32)
2920EMIT_ROR(32, uint32_t, _amd, 0, ASMRotateRightU32)
2921
2922DECL_FORCE_INLINE(uint16_t) iemAImpl_ror_u16_hlp(uint16_t uValue, uint8_t cShift)
2923{
2924 return (uValue >> cShift) | (uValue << (16 - cShift));
2925}
2926#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
2927EMIT_ROR(16, uint16_t, RT_NOTHING, 1, iemAImpl_ror_u16_hlp)
2928#endif
2929EMIT_ROR(16, uint16_t, _intel, 1, iemAImpl_ror_u16_hlp)
2930EMIT_ROR(16, uint16_t, _amd, 0, iemAImpl_ror_u16_hlp)
2931
2932DECL_FORCE_INLINE(uint8_t) iemAImpl_ror_u8_hlp(uint8_t uValue, uint8_t cShift)
2933{
2934 return (uValue >> cShift) | (uValue << (8 - cShift));
2935}
2936#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
2937EMIT_ROR(8, uint8_t, RT_NOTHING, 1, iemAImpl_ror_u8_hlp)
2938#endif
2939EMIT_ROR(8, uint8_t, _intel, 1, iemAImpl_ror_u8_hlp)
2940EMIT_ROR(8, uint8_t, _amd, 0, iemAImpl_ror_u8_hlp)
2941
2942
2943/*
2944 * RCL
2945 */
2946#define EMIT_RCL(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
2947IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_rcl_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
2948{ \
2949 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
2950 if (a_cBitsWidth < 32 && a_fIntelFlags) \
2951 cShift %= a_cBitsWidth + 1; \
2952 if (cShift) \
2953 { \
2954 if (a_cBitsWidth < 32 && !a_fIntelFlags) \
2955 cShift %= a_cBitsWidth + 1; \
2956 a_uType const uDst = *puDst; \
2957 a_uType uResult = uDst << cShift; \
2958 if (cShift > 1) \
2959 uResult |= uDst >> (a_cBitsWidth + 1 - cShift); \
2960 \
2961 AssertCompile(X86_EFL_CF_BIT == 0); \
2962 uint32_t fEfl = *pfEFlags; \
2963 uint32_t fInCarry = fEfl & X86_EFL_CF; \
2964 uResult |= (a_uType)fInCarry << (cShift - 1); \
2965 \
2966 *puDst = uResult; \
2967 \
2968 /* Calc EFLAGS. */ \
2969 fEfl &= ~(X86_EFL_CF | X86_EFL_OF); \
2970 uint32_t const fOutCarry = a_cBitsWidth >= 32 || a_fIntelFlags || cShift \
2971 ? (uDst >> (a_cBitsWidth - cShift)) & X86_EFL_CF : fInCarry; \
2972 fEfl |= fOutCarry; \
2973 if (!a_fIntelFlags) /* AMD 3990X: According to the last sub-shift: */ \
2974 fEfl |= ((uResult >> (a_cBitsWidth - 1)) ^ fOutCarry) << X86_EFL_OF_BIT; \
2975 else /* Intel 10980XE: According to the first sub-shift: */ \
2976 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uDst << 1)); \
2977 *pfEFlags = fEfl; \
2978 } \
2979}
2980
2981#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
2982EMIT_RCL(64, uint64_t, RT_NOTHING, 1)
2983#endif
2984EMIT_RCL(64, uint64_t, _intel, 1)
2985EMIT_RCL(64, uint64_t, _amd, 0)
2986
2987#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
2988EMIT_RCL(32, uint32_t, RT_NOTHING, 1)
2989#endif
2990EMIT_RCL(32, uint32_t, _intel, 1)
2991EMIT_RCL(32, uint32_t, _amd, 0)
2992
2993#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
2994EMIT_RCL(16, uint16_t, RT_NOTHING, 1)
2995#endif
2996EMIT_RCL(16, uint16_t, _intel, 1)
2997EMIT_RCL(16, uint16_t, _amd, 0)
2998
2999#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3000EMIT_RCL(8, uint8_t, RT_NOTHING, 1)
3001#endif
3002EMIT_RCL(8, uint8_t, _intel, 1)
3003EMIT_RCL(8, uint8_t, _amd, 0)
3004
3005
3006/*
3007 * RCR
3008 */
3009#define EMIT_RCR(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
3010IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_rcr_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
3011{ \
3012 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
3013 if (a_cBitsWidth < 32 && a_fIntelFlags) \
3014 cShift %= a_cBitsWidth + 1; \
3015 if (cShift) \
3016 { \
3017 if (a_cBitsWidth < 32 && !a_fIntelFlags) \
3018 cShift %= a_cBitsWidth + 1; \
3019 a_uType const uDst = *puDst; \
3020 a_uType uResult = uDst >> cShift; \
3021 if (cShift > 1) \
3022 uResult |= uDst << (a_cBitsWidth + 1 - cShift); \
3023 \
3024 AssertCompile(X86_EFL_CF_BIT == 0); \
3025 uint32_t fEfl = *pfEFlags; \
3026 uint32_t fInCarry = fEfl & X86_EFL_CF; \
3027 uResult |= (a_uType)fInCarry << (a_cBitsWidth - cShift); \
3028 *puDst = uResult; \
3029 \
3030        /* Calc EFLAGS. The OF bit is undefined if cShift > 1; we implement \
3031           it the same way as for 1-bit shifts. */ \
3032 fEfl &= ~(X86_EFL_CF | X86_EFL_OF); \
3033 uint32_t const fOutCarry = a_cBitsWidth >= 32 || a_fIntelFlags || cShift \
3034 ? (uDst >> (cShift - 1)) & X86_EFL_CF : fInCarry; \
3035 fEfl |= fOutCarry; \
3036        if (!a_fIntelFlags) /* AMD 3990X: XOR of the two most significant bits of the result: */ \
3037 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uResult ^ (uResult << 1)); \
3038 else /* Intel 10980XE: same as AMD, but only for the first sub-shift: */ \
3039 fEfl |= (fInCarry ^ (uint32_t)(uDst >> (a_cBitsWidth - 1))) << X86_EFL_OF_BIT; \
3040 *pfEFlags = fEfl; \
3041 } \
3042}
3043
3044#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3045EMIT_RCR(64, uint64_t, RT_NOTHING, 1)
3046#endif
3047EMIT_RCR(64, uint64_t, _intel, 1)
3048EMIT_RCR(64, uint64_t, _amd, 0)
3049
3050#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3051EMIT_RCR(32, uint32_t, RT_NOTHING, 1)
3052#endif
3053EMIT_RCR(32, uint32_t, _intel, 1)
3054EMIT_RCR(32, uint32_t, _amd, 0)
3055
3056#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3057EMIT_RCR(16, uint16_t, RT_NOTHING, 1)
3058#endif
3059EMIT_RCR(16, uint16_t, _intel, 1)
3060EMIT_RCR(16, uint16_t, _amd, 0)
3061
3062#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3063EMIT_RCR(8, uint8_t, RT_NOTHING, 1)
3064#endif
3065EMIT_RCR(8, uint8_t, _intel, 1)
3066EMIT_RCR(8, uint8_t, _amd, 0)
3067
3068
3069/*
3070 * SHL
3071 */
3072#define EMIT_SHL(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
3073IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shl_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
3074{ \
3075 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
3076 if (cShift) \
3077 { \
3078 a_uType const uDst = *puDst; \
3079 a_uType uResult = uDst << cShift; \
3080 *puDst = uResult; \
3081 \
3082 /* Calc EFLAGS. */ \
3083 AssertCompile(X86_EFL_CF_BIT == 0); \
3084 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
3085 uint32_t fCarry = (uDst >> (a_cBitsWidth - cShift)) & X86_EFL_CF; \
3086 fEfl |= fCarry; \
3087 if (!a_fIntelFlags) \
3088 fEfl |= ((uResult >> (a_cBitsWidth - 1)) ^ fCarry) << X86_EFL_OF_BIT; /* AMD 3990X: Last shift result. */ \
3089 else \
3090 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uDst << 1)); /* Intel 10980XE: First shift result. */ \
3091 fEfl |= X86_EFL_CALC_SF(uResult, a_cBitsWidth); \
3092 fEfl |= X86_EFL_CALC_ZF(uResult); \
3093 fEfl |= g_afParity[uResult & 0xff]; \
3094 if (!a_fIntelFlags) \
3095            fEfl |= X86_EFL_AF; /* AMD 3990X sets it unconditionally, Intel 10980XE does the opposite */ \
3096 *pfEFlags = fEfl; \
3097 } \
3098}
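/* Worked example (illustrative only): shl al, 1 with AL=0xC3 gives 0x86;
   CF = old bit 7 = 1, OF = 0 on both variants here (the new MSB equals CF, and the
   two top bits of the input match), SF=1, ZF=0, PF=0 (odd parity), and the AMD
   variant additionally sets AF while the Intel one leaves it clear. */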
3099
3100#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3101EMIT_SHL(64, uint64_t, RT_NOTHING, 1)
3102#endif
3103EMIT_SHL(64, uint64_t, _intel, 1)
3104EMIT_SHL(64, uint64_t, _amd, 0)
3105
3106#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3107EMIT_SHL(32, uint32_t, RT_NOTHING, 1)
3108#endif
3109EMIT_SHL(32, uint32_t, _intel, 1)
3110EMIT_SHL(32, uint32_t, _amd, 0)
3111
3112#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3113EMIT_SHL(16, uint16_t, RT_NOTHING, 1)
3114#endif
3115EMIT_SHL(16, uint16_t, _intel, 1)
3116EMIT_SHL(16, uint16_t, _amd, 0)
3117
3118#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3119EMIT_SHL(8, uint8_t, RT_NOTHING, 1)
3120#endif
3121EMIT_SHL(8, uint8_t, _intel, 1)
3122EMIT_SHL(8, uint8_t, _amd, 0)
3123
3124
3125/*
3126 * SHR
3127 */
3128#define EMIT_SHR(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
3129IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shr_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
3130{ \
3131 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
3132 if (cShift) \
3133 { \
3134 a_uType const uDst = *puDst; \
3135 a_uType uResult = uDst >> cShift; \
3136 *puDst = uResult; \
3137 \
3138 /* Calc EFLAGS. */ \
3139 AssertCompile(X86_EFL_CF_BIT == 0); \
3140 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
3141 fEfl |= (uDst >> (cShift - 1)) & X86_EFL_CF; \
3142        if (a_fIntelFlags || cShift == 1) /* AMD 3990X does what Intel documents; Intel 10980XE does this for all shift counts. */ \
3143 fEfl |= (uDst >> (a_cBitsWidth - 1)) << X86_EFL_OF_BIT; \
3144 fEfl |= X86_EFL_CALC_SF(uResult, a_cBitsWidth); \
3145 fEfl |= X86_EFL_CALC_ZF(uResult); \
3146 fEfl |= g_afParity[uResult & 0xff]; \
3147 if (!a_fIntelFlags) \
3148            fEfl |= X86_EFL_AF; /* AMD 3990X sets it unconditionally, Intel 10980XE does the opposite */ \
3149 *pfEFlags = fEfl; \
3150 } \
3151}
3152
3153#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3154EMIT_SHR(64, uint64_t, RT_NOTHING, 1)
3155#endif
3156EMIT_SHR(64, uint64_t, _intel, 1)
3157EMIT_SHR(64, uint64_t, _amd, 0)
3158
3159#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3160EMIT_SHR(32, uint32_t, RT_NOTHING, 1)
3161#endif
3162EMIT_SHR(32, uint32_t, _intel, 1)
3163EMIT_SHR(32, uint32_t, _amd, 0)
3164
3165#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3166EMIT_SHR(16, uint16_t, RT_NOTHING, 1)
3167#endif
3168EMIT_SHR(16, uint16_t, _intel, 1)
3169EMIT_SHR(16, uint16_t, _amd, 0)
3170
3171#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3172EMIT_SHR(8, uint8_t, RT_NOTHING, 1)
3173#endif
3174EMIT_SHR(8, uint8_t, _intel, 1)
3175EMIT_SHR(8, uint8_t, _amd, 0)
3176
3177
3178/*
3179 * SAR
3180 */
3181#define EMIT_SAR(a_cBitsWidth, a_uType, a_iType, a_Suffix, a_fIntelFlags) \
3182IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_sar_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
3183{ \
3184 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
3185 if (cShift) \
3186 { \
3187 a_iType const iDst = (a_iType)*puDst; \
3188 a_uType uResult = iDst >> cShift; \
3189 *puDst = uResult; \
3190 \
3191 /* Calc EFLAGS. \
3192           Note! The OF flag is always zero because the sign bit of the result never differs from that of the input. */ \
3193 AssertCompile(X86_EFL_CF_BIT == 0); \
3194 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
3195 fEfl |= (iDst >> (cShift - 1)) & X86_EFL_CF; \
3196 fEfl |= X86_EFL_CALC_SF(uResult, a_cBitsWidth); \
3197 fEfl |= X86_EFL_CALC_ZF(uResult); \
3198 fEfl |= g_afParity[uResult & 0xff]; \
3199 if (!a_fIntelFlags) \
3200            fEfl |= X86_EFL_AF; /* AMD 3990X sets it unconditionally, Intel 10980XE does the opposite */ \
3201 *pfEFlags = fEfl; \
3202 } \
3203}
3204
3205#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3206EMIT_SAR(64, uint64_t, int64_t, RT_NOTHING, 1)
3207#endif
3208EMIT_SAR(64, uint64_t, int64_t, _intel, 1)
3209EMIT_SAR(64, uint64_t, int64_t, _amd, 0)
3210
3211#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3212EMIT_SAR(32, uint32_t, int32_t, RT_NOTHING, 1)
3213#endif
3214EMIT_SAR(32, uint32_t, int32_t, _intel, 1)
3215EMIT_SAR(32, uint32_t, int32_t, _amd, 0)
3216
3217#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3218EMIT_SAR(16, uint16_t, int16_t, RT_NOTHING, 1)
3219#endif
3220EMIT_SAR(16, uint16_t, int16_t, _intel, 1)
3221EMIT_SAR(16, uint16_t, int16_t, _amd, 0)
3222
3223#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3224EMIT_SAR(8, uint8_t, int8_t, RT_NOTHING, 1)
3225#endif
3226EMIT_SAR(8, uint8_t, int8_t, _intel, 1)
3227EMIT_SAR(8, uint8_t, int8_t, _amd, 0)
3228
3229
3230/*
3231 * SHLD
3232 *
3233 * - CF is the last bit shifted out of puDst.
3234 * - AF is always cleared by Intel 10980XE.
3235 * - AF is always set by AMD 3990X.
3236 * - OF is set according to the first shift on Intel 10980XE, it seems.
3237 * - OF is set according to the last sub-shift on AMD 3990X.
3238 * - ZF, SF and PF are calculated according to the result by both vendors.
3239 *
3240 * For 16-bit shifts the count mask isn't 15, but 31, and the CPU will
3241 * pick either the source register or the destination register for input bits
3242 * when going beyond 16. According to https://www.sandpile.org/x86/flags.htm
3243 * Intel has changed behaviour here several times. We implement what current
3244 * Skylake-based CPUs do for now; we can extend this later as needed.
3245 */
3246#define EMIT_SHLD(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
3247IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shld_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, uint8_t cShift, \
3248 uint32_t *pfEFlags)) \
3249{ \
3250 cShift &= a_cBitsWidth - 1; \
3251 if (cShift) \
3252 { \
3253 a_uType const uDst = *puDst; \
3254 a_uType uResult = uDst << cShift; \
3255 uResult |= uSrc >> (a_cBitsWidth - cShift); \
3256 *puDst = uResult; \
3257 \
3258 /* CALC EFLAGS: */ \
3259 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
3260 if (a_fIntelFlags) \
3261 /* Intel 6700K & 10980XE: Set according to the first shift. AF always cleared. */ \
3262 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uDst << 1)); \
3263 else \
3264 { /* AMD 3990X: Set according to last shift. AF always set. */ \
3265 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth((uDst << (cShift - 1)) ^ uResult); \
3266 fEfl |= X86_EFL_AF; \
3267 } \
3268 AssertCompile(X86_EFL_CF_BIT == 0); \
3269 fEfl |= (uDst >> (a_cBitsWidth - cShift)) & X86_EFL_CF; /* CF = last bit shifted out */ \
3270 fEfl |= g_afParity[uResult & 0xff]; \
3271 fEfl |= X86_EFL_CALC_SF(uResult, a_cBitsWidth); \
3272 fEfl |= X86_EFL_CALC_ZF(uResult); \
3273 *pfEFlags = fEfl; \
3274 } \
3275}
3276
3277#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3278EMIT_SHLD(64, uint64_t, RT_NOTHING, 1)
3279#endif
3280EMIT_SHLD(64, uint64_t, _intel, 1)
3281EMIT_SHLD(64, uint64_t, _amd, 0)
3282
3283#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3284EMIT_SHLD(32, uint32_t, RT_NOTHING, 1)
3285#endif
3286EMIT_SHLD(32, uint32_t, _intel, 1)
3287EMIT_SHLD(32, uint32_t, _amd, 0)
3288
3289#define EMIT_SHLD_16(a_Suffix, a_fIntelFlags) \
3290IEM_DECL_IMPL_DEF(void, RT_CONCAT(iemAImpl_shld_u16,a_Suffix),(uint16_t *puDst, uint16_t uSrc, uint8_t cShift, uint32_t *pfEFlags)) \
3291{ \
3292 cShift &= 31; \
3293 if (cShift) \
3294 { \
3295 uint16_t const uDst = *puDst; \
3296 uint64_t const uTmp = a_fIntelFlags \
3297 ? ((uint64_t)uDst << 32) | ((uint32_t)uSrc << 16) | uDst \
3298 : ((uint64_t)uDst << 32) | ((uint32_t)uSrc << 16) | uSrc; \
3299 uint16_t const uResult = (uint16_t)((uTmp << cShift) >> 32); \
3300 *puDst = uResult; \
3301 \
3302 /* CALC EFLAGS: */ \
3303 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
3304 AssertCompile(X86_EFL_CF_BIT == 0); \
3305 if (a_fIntelFlags) \
3306 { \
3307 fEfl |= (uTmp >> (48 - cShift)) & X86_EFL_CF; /* CF = last bit shifted out of the combined operand */ \
3308            /* Intel 6700K & 10980XE: OF is set according to the first shift. AF always cleared. */ \
3309 fEfl |= X86_EFL_GET_OF_16(uDst ^ (uDst << 1)); \
3310 } \
3311 else \
3312 { \
3313 /* AMD 3990X: OF is set according to last shift, with some weirdness. AF always set. CF = last bit shifted out of uDst. */ \
3314 if (cShift < 16) \
3315 { \
3316 fEfl |= (uDst >> (16 - cShift)) & X86_EFL_CF; \
3317 fEfl |= X86_EFL_GET_OF_16((uDst << (cShift - 1)) ^ uResult); \
3318 } \
3319 else \
3320 { \
3321 if (cShift == 16) \
3322 fEfl |= uDst & X86_EFL_CF; \
3323 fEfl |= X86_EFL_GET_OF_16((uDst << (cShift - 1)) ^ 0); \
3324 } \
3325 fEfl |= X86_EFL_AF; \
3326 } \
3327 fEfl |= g_afParity[uResult & 0xff]; \
3328 fEfl |= X86_EFL_CALC_SF(uResult, 16); \
3329 fEfl |= X86_EFL_CALC_ZF(uResult); \
3330 *pfEFlags = fEfl; \
3331 } \
3332}
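/* Worked example (illustrative only): for shld ax, bx, 20 the result is taken from
   bits 27..12 of uTmp, i.e. (uSrc << 4) | (uDst >> 12) with the Intel composition
   above and (uSrc << 4) | (uSrc >> 12) with the AMD one -- this is where the two
   vendors pick different input bits once the count exceeds 16. */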
3333
3334#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3335EMIT_SHLD_16(RT_NOTHING, 1)
3336#endif
3337EMIT_SHLD_16(_intel, 1)
3338EMIT_SHLD_16(_amd, 0)
3339
3340
3341/*
3342 * SHRD
3343 *
3344 * EFLAGS behaviour seems to be the same as with SHLD:
3345 * - CF is the last bit shifted out of puDst.
3346 * - AF is always cleared by Intel 10980XE.
3347 * - AF is always set by AMD 3990X.
3348 * - OF is set according to the first shift on Intel 10980XE, it seems.
3349 * - OF is set according to the last sub-shift on AMD 3990X.
3350 * - ZF, SF and PF are calculated according to the result by both vendors.
3351 *
3352 * For 16-bit shifts the count mask isn't 15, but 31, and the CPU will
3353 * pick either the source register or the destination register for input bits
3354 * when going beyond 16. According to https://www.sandpile.org/x86/flags.htm
3355 * Intel has changed behaviour here several times. We implement what current
3356 * Skylake-based CPUs do for now; we can extend this later as needed.
3357 */
3358#define EMIT_SHRD(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
3359IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shrd_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, uint8_t cShift, uint32_t *pfEFlags)) \
3360{ \
3361 cShift &= a_cBitsWidth - 1; \
3362 if (cShift) \
3363 { \
3364 a_uType const uDst = *puDst; \
3365 a_uType uResult = uDst >> cShift; \
3366 uResult |= uSrc << (a_cBitsWidth - cShift); \
3367 *puDst = uResult; \
3368 \
3369 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
3370 AssertCompile(X86_EFL_CF_BIT == 0); \
3371 fEfl |= (uDst >> (cShift - 1)) & X86_EFL_CF; \
3372 if (a_fIntelFlags) \
3373 /* Intel 6700K & 10980XE: Set according to the first shift. AF always cleared. */ \
3374 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uSrc << (a_cBitsWidth - 1))); \
3375 else \
3376 { /* AMD 3990X: Set according to last shift. AF always set. */ \
3377 if (cShift > 1) /* Set according to last shift. */ \
3378 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth((uSrc << (a_cBitsWidth - cShift + 1)) ^ uResult); \
3379 else \
3380 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ uResult); \
3381 fEfl |= X86_EFL_AF; \
3382 } \
3383 fEfl |= X86_EFL_CALC_SF(uResult, a_cBitsWidth); \
3384 fEfl |= X86_EFL_CALC_ZF(uResult); \
3385 fEfl |= g_afParity[uResult & 0xff]; \
3386 *pfEFlags = fEfl; \
3387 } \
3388}
3389
3390#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3391EMIT_SHRD(64, uint64_t, RT_NOTHING, 1)
3392#endif
3393EMIT_SHRD(64, uint64_t, _intel, 1)
3394EMIT_SHRD(64, uint64_t, _amd, 0)
3395
3396#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3397EMIT_SHRD(32, uint32_t, RT_NOTHING, 1)
3398#endif
3399EMIT_SHRD(32, uint32_t, _intel, 1)
3400EMIT_SHRD(32, uint32_t, _amd, 0)
3401
3402#define EMIT_SHRD_16(a_Suffix, a_fIntelFlags) \
3403IEM_DECL_IMPL_DEF(void, RT_CONCAT(iemAImpl_shrd_u16,a_Suffix),(uint16_t *puDst, uint16_t uSrc, uint8_t cShift, uint32_t *pfEFlags)) \
3404{ \
3405 cShift &= 31; \
3406 if (cShift) \
3407 { \
3408 uint16_t const uDst = *puDst; \
3409 uint64_t const uTmp = a_fIntelFlags \
3410 ? uDst | ((uint32_t)uSrc << 16) | ((uint64_t)uDst << 32) \
3411 : uDst | ((uint32_t)uSrc << 16) | ((uint64_t)uSrc << 32); \
3412 uint16_t const uResult = (uint16_t)(uTmp >> cShift); \
3413 *puDst = uResult; \
3414 \
3415 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
3416 AssertCompile(X86_EFL_CF_BIT == 0); \
3417 if (a_fIntelFlags) \
3418 { \
3419            /* Intel 10980XE: The CF is the last bit shifted out of the combined uTmp operand. */ \
3420 fEfl |= (uTmp >> (cShift - 1)) & X86_EFL_CF; \
3421 /* Intel 6700K & 10980XE: Set according to the first shift. AF always cleared. */ \
3422 fEfl |= X86_EFL_GET_OF_16(uDst ^ (uSrc << 15)); \
3423 } \
3424 else \
3425 { \
3426 /* AMD 3990X: CF flag seems to be last bit shifted out of uDst, not the combined uSrc:uSrc:uDst operand. */ \
3427 fEfl |= (uDst >> (cShift - 1)) & X86_EFL_CF; \
3428 /* AMD 3990X: Set according to last shift. AF always set. */ \
3429 if (cShift > 1) /* Set according to last shift. */ \
3430 fEfl |= X86_EFL_GET_OF_16((uint16_t)(uTmp >> (cShift - 1)) ^ uResult); \
3431 else \
3432 fEfl |= X86_EFL_GET_OF_16(uDst ^ uResult); \
3433 fEfl |= X86_EFL_AF; \
3434 } \
3435 fEfl |= X86_EFL_CALC_SF(uResult, 16); \
3436 fEfl |= X86_EFL_CALC_ZF(uResult); \
3437 fEfl |= g_afParity[uResult & 0xff]; \
3438 *pfEFlags = fEfl; \
3439 } \
3440}
3441
3442#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3443EMIT_SHRD_16(RT_NOTHING, 1)
3444#endif
3445EMIT_SHRD_16(_intel, 1)
3446EMIT_SHRD_16(_amd, 0)
3447
3448
3449/*
3450 * RORX (BMI2)
3451 */
3452#define EMIT_RORX(a_cBitsWidth, a_uType, a_fnHlp) \
3453IEM_DECL_IMPL_DEF(void, RT_CONCAT(iemAImpl_rorx_u,a_cBitsWidth),(a_uType *puDst, a_uType uSrc, a_uType cShift)) \
3454{ \
3455 *puDst = a_fnHlp(uSrc, cShift & (a_cBitsWidth - 1)); \
3456}
3457
3458#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3459EMIT_RORX(64, uint64_t, ASMRotateRightU64)
3460#endif
3461#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3462EMIT_RORX(32, uint32_t, ASMRotateRightU32)
3463#endif
3464
3465
3466/*
3467 * SHLX (BMI2)
3468 */
3469#define EMIT_SHLX(a_cBitsWidth, a_uType, a_Suffix) \
3470IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shlx_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, a_uType cShift)) \
3471{ \
3472 cShift &= a_cBitsWidth - 1; \
3473 *puDst = uSrc << cShift; \
3474}
3475
3476#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3477EMIT_SHLX(64, uint64_t, RT_NOTHING)
3478EMIT_SHLX(64, uint64_t, _fallback)
3479#endif
3480#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3481EMIT_SHLX(32, uint32_t, RT_NOTHING)
3482EMIT_SHLX(32, uint32_t, _fallback)
3483#endif
3484
3485
3486/*
3487 * SHRX (BMI2)
3488 */
3489#define EMIT_SHRX(a_cBitsWidth, a_uType, a_Suffix) \
3490IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shrx_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, a_uType cShift)) \
3491{ \
3492 cShift &= a_cBitsWidth - 1; \
3493 *puDst = uSrc >> cShift; \
3494}
3495
3496#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3497EMIT_SHRX(64, uint64_t, RT_NOTHING)
3498EMIT_SHRX(64, uint64_t, _fallback)
3499#endif
3500#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3501EMIT_SHRX(32, uint32_t, RT_NOTHING)
3502EMIT_SHRX(32, uint32_t, _fallback)
3503#endif
3504
3505
3506/*
3507 * SARX (BMI2)
3508 */
3509#define EMIT_SARX(a_cBitsWidth, a_uType, a_iType, a_Suffix) \
3510IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_sarx_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, a_uType cShift)) \
3511{ \
3512 cShift &= a_cBitsWidth - 1; \
3513 *puDst = (a_iType)uSrc >> cShift; \
3514}
3515
3516#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3517EMIT_SARX(64, uint64_t, int64_t, RT_NOTHING)
3518EMIT_SARX(64, uint64_t, int64_t, _fallback)
3519#endif
3520#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3521EMIT_SARX(32, uint32_t, int32_t, RT_NOTHING)
3522EMIT_SARX(32, uint32_t, int32_t, _fallback)
3523#endif
3524
3525
3526/*
3527 * PDEP (BMI2)
3528 */
3529#define EMIT_PDEP(a_cBitsWidth, a_uType, a_Suffix) \
3530IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_pdep_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, a_uType fMask)) \
3531{ \
3532 a_uType uResult = 0; \
3533 for (unsigned iMaskBit = 0, iBit = 0; iMaskBit < a_cBitsWidth; iMaskBit++) \
3534 if (fMask & ((a_uType)1 << iMaskBit)) \
3535 { \
3536 uResult |= ((uSrc >> iBit) & 1) << iMaskBit; \
3537 iBit++; \
3538 } \
3539 *puDst = uResult; \
3540}
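/* Worked example (illustrative only): pdep with uSrc=0x0b (1011b) and fMask=0xa6
   (10100110b) deposits the low source bits into the set mask positions 1, 2, 5
   and 7 from the bottom up, yielding 0x86 (10000110b). */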
3541
3542#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3543EMIT_PDEP(64, uint64_t, RT_NOTHING)
3544#endif
3545EMIT_PDEP(64, uint64_t, _fallback)
3546#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3547EMIT_PDEP(32, uint32_t, RT_NOTHING)
3548#endif
3549EMIT_PDEP(32, uint32_t, _fallback)
3550
3551/*
3552 * PEXT (BMI2)
3553 */
3554#define EMIT_PEXT(a_cBitsWidth, a_uType, a_Suffix) \
3555IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_pext_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, a_uType fMask)) \
3556{ \
3557 a_uType uResult = 0; \
3558 for (unsigned iMaskBit = 0, iBit = 0; iMaskBit < a_cBitsWidth; iMaskBit++) \
3559 if (fMask & ((a_uType)1 << iMaskBit)) \
3560 { \
3561 uResult |= ((uSrc >> iMaskBit) & 1) << iBit; \
3562 iBit++; \
3563 } \
3564 *puDst = uResult; \
3565}
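/* Worked example (illustrative only): pext with uSrc=0x86 and fMask=0xa6 gathers
   the source bits found at mask positions 1, 2, 5 and 7 (1, 1, 0, 1) and packs
   them into the low bits, yielding 0x0b -- the inverse of the pdep example above. */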
3566
3567#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3568EMIT_PEXT(64, uint64_t, RT_NOTHING)
3569#endif
3570EMIT_PEXT(64, uint64_t, _fallback)
3571#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3572EMIT_PEXT(32, uint32_t, RT_NOTHING)
3573#endif
3574EMIT_PEXT(32, uint32_t, _fallback)
3575
3576
3577#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3578
3579# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
3580/*
3581 * BSWAP
3582 */
3583
3584IEM_DECL_IMPL_DEF(void, iemAImpl_bswap_u64,(uint64_t *puDst))
3585{
3586 *puDst = ASMByteSwapU64(*puDst);
3587}
3588
3589
3590IEM_DECL_IMPL_DEF(void, iemAImpl_bswap_u32,(uint32_t *puDst))
3591{
3592 *puDst = ASMByteSwapU32(*puDst);
3593}
3594
3595
3596/* Note! Undocumented operand size, hence the 32-bit argument. */
3597IEM_DECL_IMPL_DEF(void, iemAImpl_bswap_u16,(uint32_t *puDst))
3598{
3599#if 0
3600 *(uint16_t *)puDst = ASMByteSwapU16(*(uint16_t *)puDst);
3601#else
3602    /* This is the behaviour of the AMD 3990X (64-bit mode): */
3603 *(uint16_t *)puDst = 0;
3604#endif
3605}
3606
3607# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
3608
3609
3610
3611# if defined(IEM_WITHOUT_ASSEMBLY)
3612
3613/*
3614 * LFENCE, SFENCE & MFENCE.
3615 */
3616
3617IEM_DECL_IMPL_DEF(void, iemAImpl_lfence,(void))
3618{
3619 ASMReadFence();
3620}
3621
3622
3623IEM_DECL_IMPL_DEF(void, iemAImpl_sfence,(void))
3624{
3625 ASMWriteFence();
3626}
3627
3628
3629IEM_DECL_IMPL_DEF(void, iemAImpl_mfence,(void))
3630{
3631 ASMMemoryFence();
3632}
3633
3634
3635# ifndef RT_ARCH_ARM64
3636IEM_DECL_IMPL_DEF(void, iemAImpl_alt_mem_fence,(void))
3637{
3638 ASMMemoryFence();
3639}
3640# endif
3641
3642# endif
3643
3644#endif /* !RT_ARCH_AMD64 || IEM_WITHOUT_ASSEMBLY */
3645
3646
3647IEM_DECL_IMPL_DEF(void, iemAImpl_arpl,(uint16_t *pu16Dst, uint16_t u16Src, uint32_t *pfEFlags))
3648{
3649 if ((*pu16Dst & X86_SEL_RPL) < (u16Src & X86_SEL_RPL))
3650 {
3651 *pu16Dst &= X86_SEL_MASK_OFF_RPL;
3652 *pu16Dst |= u16Src & X86_SEL_RPL;
3653
3654 *pfEFlags |= X86_EFL_ZF;
3655 }
3656 else
3657 *pfEFlags &= ~X86_EFL_ZF;
3658}
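/* Worked example (illustrative only): arpl with *pu16Dst=0x000b (RPL=3) and
   u16Src=0x0008 (RPL=0) changes nothing and clears ZF; with the operands the other
   way around the destination RPL is raised from 0 to 3 (0x0008 becomes 0x000b) and
   ZF is set. */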
3659
3660
3661#if defined(IEM_WITHOUT_ASSEMBLY)
3662
3663/*********************************************************************************************************************************
3664* x87 FPU Loads *
3665*********************************************************************************************************************************/
3666
3667IEM_DECL_IMPL_DEF(void, iemAImpl_fld_r80_from_r32,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT32U pr32Val))
3668{
3669 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3670 if (RTFLOAT32U_IS_NORMAL(pr32Val))
3671 {
3672 pFpuRes->r80Result.sj64.fSign = pr32Val->s.fSign;
3673 pFpuRes->r80Result.sj64.fInteger = 1;
3674 pFpuRes->r80Result.sj64.uFraction = (uint64_t)pr32Val->s.uFraction
3675 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS);
3676 pFpuRes->r80Result.sj64.uExponent = pr32Val->s.uExponent - RTFLOAT32U_EXP_BIAS + RTFLOAT80U_EXP_BIAS;
3677 Assert(RTFLOAT80U_IS_NORMAL(&pFpuRes->r80Result));
3678 }
3679 else if (RTFLOAT32U_IS_ZERO(pr32Val))
3680 {
3681 pFpuRes->r80Result.s.fSign = pr32Val->s.fSign;
3682 pFpuRes->r80Result.s.uExponent = 0;
3683 pFpuRes->r80Result.s.uMantissa = 0;
3684 Assert(RTFLOAT80U_IS_ZERO(&pFpuRes->r80Result));
3685 }
3686 else if (RTFLOAT32U_IS_SUBNORMAL(pr32Val))
3687 {
3688        /* Subnormal values get normalized. */
3689 pFpuRes->r80Result.sj64.fSign = pr32Val->s.fSign;
3690 pFpuRes->r80Result.sj64.fInteger = 1;
3691 unsigned const cExtraShift = RTFLOAT32U_FRACTION_BITS - ASMBitLastSetU32(pr32Val->s.uFraction);
3692 pFpuRes->r80Result.sj64.uFraction = (uint64_t)pr32Val->s.uFraction
3693 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS + cExtraShift + 1);
3694 pFpuRes->r80Result.sj64.uExponent = pr32Val->s.uExponent - RTFLOAT32U_EXP_BIAS + RTFLOAT80U_EXP_BIAS - cExtraShift;
3695 pFpuRes->FSW |= X86_FSW_DE;
3696 if (!(pFpuState->FCW & X86_FCW_DM))
3697 pFpuRes->FSW |= X86_FSW_ES | X86_FSW_B; /* The value is still pushed. */
3698 }
3699 else if (RTFLOAT32U_IS_INF(pr32Val))
3700 {
3701 pFpuRes->r80Result.s.fSign = pr32Val->s.fSign;
3702 pFpuRes->r80Result.s.uExponent = RTFLOAT80U_EXP_MAX;
3703 pFpuRes->r80Result.s.uMantissa = RT_BIT_64(63);
3704 Assert(RTFLOAT80U_IS_INF(&pFpuRes->r80Result));
3705 }
3706 else
3707 {
3708        /* Signalling and quiet NaNs both turn into quiet ones when loaded (weird). */
3709 Assert(RTFLOAT32U_IS_NAN(pr32Val));
3710 pFpuRes->r80Result.sj64.fSign = pr32Val->s.fSign;
3711 pFpuRes->r80Result.sj64.uExponent = RTFLOAT80U_EXP_MAX;
3712 pFpuRes->r80Result.sj64.fInteger = 1;
3713 pFpuRes->r80Result.sj64.uFraction = (uint64_t)pr32Val->s.uFraction
3714 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS);
3715 if (RTFLOAT32U_IS_SIGNALLING_NAN(pr32Val))
3716 {
3717 pFpuRes->r80Result.sj64.uFraction |= RT_BIT_64(62); /* make quiet */
3718 Assert(RTFLOAT80U_IS_QUIET_NAN(&pFpuRes->r80Result));
3719 pFpuRes->FSW |= X86_FSW_IE;
3720
3721 if (!(pFpuState->FCW & X86_FCW_IM))
3722 {
3723 /* The value is not pushed. */
3724 pFpuRes->FSW &= ~X86_FSW_TOP_MASK;
3725 pFpuRes->FSW |= X86_FSW_ES | X86_FSW_B;
3726 pFpuRes->r80Result.au64[0] = 0;
3727 pFpuRes->r80Result.au16[4] = 0;
3728 }
3729 }
3730 else
3731 Assert(RTFLOAT80U_IS_QUIET_NAN(&pFpuRes->r80Result));
3732 }
3733}
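/* Worked example (illustrative only): loading the single precision subnormal
   0x00400000 (fraction bit 22 set, value 2^-127) takes the subnormal branch above
   with cExtraShift = 0; the fraction is shifted up so its leading bit becomes the
   integer bit (fInteger=1, uFraction=0) and the biased exponent becomes
   0 - 127 + 16383 - 0 = 16256, i.e. a normal 1.0 * 2^-127, with DE raised. */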
3734
3735
3736IEM_DECL_IMPL_DEF(void, iemAImpl_fld_r80_from_r64,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT64U pr64Val))
3737{
3738 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3739 if (RTFLOAT64U_IS_NORMAL(pr64Val))
3740 {
3741 pFpuRes->r80Result.sj64.fSign = pr64Val->s.fSign;
3742 pFpuRes->r80Result.sj64.fInteger = 1;
3743 pFpuRes->r80Result.sj64.uFraction = pr64Val->s64.uFraction << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS);
3744 pFpuRes->r80Result.sj64.uExponent = pr64Val->s.uExponent - RTFLOAT64U_EXP_BIAS + RTFLOAT80U_EXP_BIAS;
3745 Assert(RTFLOAT80U_IS_NORMAL(&pFpuRes->r80Result));
3746 }
3747 else if (RTFLOAT64U_IS_ZERO(pr64Val))
3748 {
3749 pFpuRes->r80Result.s.fSign = pr64Val->s.fSign;
3750 pFpuRes->r80Result.s.uExponent = 0;
3751 pFpuRes->r80Result.s.uMantissa = 0;
3752 Assert(RTFLOAT80U_IS_ZERO(&pFpuRes->r80Result));
3753 }
3754 else if (RTFLOAT64U_IS_SUBNORMAL(pr64Val))
3755 {
3756        /* Subnormal values get normalized. */
3757 pFpuRes->r80Result.sj64.fSign = pr64Val->s.fSign;
3758 pFpuRes->r80Result.sj64.fInteger = 1;
3759 unsigned const cExtraShift = RTFLOAT64U_FRACTION_BITS - ASMBitLastSetU64(pr64Val->s64.uFraction);
3760 pFpuRes->r80Result.sj64.uFraction = pr64Val->s64.uFraction
3761 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS + cExtraShift + 1);
3762 pFpuRes->r80Result.sj64.uExponent = pr64Val->s.uExponent - RTFLOAT64U_EXP_BIAS + RTFLOAT80U_EXP_BIAS - cExtraShift;
3763 pFpuRes->FSW |= X86_FSW_DE;
3764 if (!(pFpuState->FCW & X86_FCW_DM))
3765 pFpuRes->FSW |= X86_FSW_ES | X86_FSW_B; /* The value is still pushed. */
3766 }
3767 else if (RTFLOAT64U_IS_INF(pr64Val))
3768 {
3769 pFpuRes->r80Result.s.fSign = pr64Val->s.fSign;
3770 pFpuRes->r80Result.s.uExponent = RTFLOAT80U_EXP_MAX;
3771 pFpuRes->r80Result.s.uMantissa = RT_BIT_64(63);
3772 Assert(RTFLOAT80U_IS_INF(&pFpuRes->r80Result));
3773 }
3774 else
3775 {
3776        /* Signalling and quiet NaNs both turn into quiet ones when loaded (weird). */
3777 Assert(RTFLOAT64U_IS_NAN(pr64Val));
3778 pFpuRes->r80Result.sj64.fSign = pr64Val->s.fSign;
3779 pFpuRes->r80Result.sj64.uExponent = RTFLOAT80U_EXP_MAX;
3780 pFpuRes->r80Result.sj64.fInteger = 1;
3781 pFpuRes->r80Result.sj64.uFraction = pr64Val->s64.uFraction << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS);
3782 if (RTFLOAT64U_IS_SIGNALLING_NAN(pr64Val))
3783 {
3784 pFpuRes->r80Result.sj64.uFraction |= RT_BIT_64(62); /* make quiet */
3785 Assert(RTFLOAT80U_IS_QUIET_NAN(&pFpuRes->r80Result));
3786 pFpuRes->FSW |= X86_FSW_IE;
3787
3788 if (!(pFpuState->FCW & X86_FCW_IM))
3789 {
3790 /* The value is not pushed. */
3791 pFpuRes->FSW &= ~X86_FSW_TOP_MASK;
3792 pFpuRes->FSW |= X86_FSW_ES | X86_FSW_B;
3793 pFpuRes->r80Result.au64[0] = 0;
3794 pFpuRes->r80Result.au16[4] = 0;
3795 }
3796 }
3797 else
3798 Assert(RTFLOAT80U_IS_QUIET_NAN(&pFpuRes->r80Result));
3799 }
3800}
3801
3802
3803IEM_DECL_IMPL_DEF(void, iemAImpl_fld_r80_from_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
3804{
3805 pFpuRes->r80Result.au64[0] = pr80Val->au64[0];
3806 pFpuRes->r80Result.au16[4] = pr80Val->au16[4];
3807 /* Raises no exceptions. */
3808 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3809}
3810
3811
3812IEM_DECL_IMPL_DEF(void, iemAImpl_fld1,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
3813{
3814 pFpuRes->r80Result.sj64.fSign = 0;
3815 pFpuRes->r80Result.sj64.uExponent = 0 + 16383;
3816 pFpuRes->r80Result.sj64.fInteger = 1;
3817 pFpuRes->r80Result.sj64.uFraction = 0;
3818
3819 /*
3820 * FPU status word:
3821 * - TOP is irrelevant, but we must match x86 assembly version.
3822 * - C1 is always cleared as we don't have any stack overflows.
3823 * - C0, C2, and C3 are undefined and Intel 10980XE does not touch them.
3824 */
3825 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3));
3826}
3827
3828
3829IEM_DECL_IMPL_DEF(void, iemAImpl_fldl2e,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
3830{
3831 pFpuRes->r80Result.sj64.fSign = 0;
3832 pFpuRes->r80Result.sj64.uExponent = 0 + 16383;
3833 pFpuRes->r80Result.sj64.fInteger = 1;
3834 pFpuRes->r80Result.sj64.uFraction = (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
3835 || (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_UP
3836 ? UINT64_C(0x38aa3b295c17f0bc) : UINT64_C(0x38aa3b295c17f0bb);
3837 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3838}
3839
3840
3841IEM_DECL_IMPL_DEF(void, iemAImpl_fldl2t,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
3842{
3843 pFpuRes->r80Result.sj64.fSign = 0;
3844 pFpuRes->r80Result.sj64.uExponent = 1 + 16383;
3845 pFpuRes->r80Result.sj64.fInteger = 1;
3846 pFpuRes->r80Result.sj64.uFraction = (pFpuState->FCW & X86_FCW_RC_MASK) != X86_FCW_RC_UP
3847 ? UINT64_C(0x549a784bcd1b8afe) : UINT64_C(0x549a784bcd1b8aff);
3848 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3849}
3850
3851
3852IEM_DECL_IMPL_DEF(void, iemAImpl_fldlg2,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
3853{
3854 pFpuRes->r80Result.sj64.fSign = 0;
3855 pFpuRes->r80Result.sj64.uExponent = -2 + 16383;
3856 pFpuRes->r80Result.sj64.fInteger = 1;
3857 pFpuRes->r80Result.sj64.uFraction = (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
3858 || (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_UP
3859 ? UINT64_C(0x1a209a84fbcff799) : UINT64_C(0x1a209a84fbcff798);
3860 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3861}
3862
3863
3864IEM_DECL_IMPL_DEF(void, iemAImpl_fldln2,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
3865{
3866 pFpuRes->r80Result.sj64.fSign = 0;
3867 pFpuRes->r80Result.sj64.uExponent = -1 + 16383;
3868 pFpuRes->r80Result.sj64.fInteger = 1;
3869 pFpuRes->r80Result.sj64.uFraction = (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
3870 || (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_UP
3871 ? UINT64_C(0x317217f7d1cf79ac) : UINT64_C(0x317217f7d1cf79ab);
3872 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3873}
3874
3875
3876IEM_DECL_IMPL_DEF(void, iemAImpl_fldpi,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
3877{
3878 pFpuRes->r80Result.sj64.fSign = 0;
3879 pFpuRes->r80Result.sj64.uExponent = 1 + 16383;
3880 pFpuRes->r80Result.sj64.fInteger = 1;
3881 pFpuRes->r80Result.sj64.uFraction = (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
3882 || (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_UP
3883 ? UINT64_C(0x490fdaa22168c235) : UINT64_C(0x490fdaa22168c234);
3884 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3885}
3886
3887
3888IEM_DECL_IMPL_DEF(void, iemAImpl_fldz,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
3889{
3890 pFpuRes->r80Result.s.fSign = 0;
3891 pFpuRes->r80Result.s.uExponent = 0;
3892 pFpuRes->r80Result.s.uMantissa = 0;
3893 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3894}
3895
3896#define EMIT_FILD(a_cBits) \
3897IEM_DECL_IMPL_DEF(void, iemAImpl_fild_r80_from_i ## a_cBits,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, \
3898 int ## a_cBits ## _t const *piVal)) \
3899{ \
3900 int ## a_cBits ## _t iVal = *piVal; \
3901 if (iVal == 0) \
3902 { \
3903 pFpuRes->r80Result.s.fSign = 0; \
3904 pFpuRes->r80Result.s.uExponent = 0; \
3905 pFpuRes->r80Result.s.uMantissa = 0; \
3906 } \
3907 else \
3908 { \
3909 if (iVal > 0) \
3910 pFpuRes->r80Result.s.fSign = 0; \
3911 else \
3912 { \
3913 pFpuRes->r80Result.s.fSign = 1; \
3914 iVal = -iVal; \
3915 } \
3916 unsigned const cBits = ASMBitLastSetU ## a_cBits((uint ## a_cBits ## _t)iVal); \
3917 pFpuRes->r80Result.s.uExponent = cBits - 1 + RTFLOAT80U_EXP_BIAS; \
3918 pFpuRes->r80Result.s.uMantissa = (uint64_t)iVal << (RTFLOAT80U_FRACTION_BITS + 1 - cBits); \
3919 } \
3920 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */ \
3921}
3922EMIT_FILD(16)
3923EMIT_FILD(32)
3924EMIT_FILD(64)
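/* Worked example (illustrative only): fild with an int16_t value of -5 sets
   fSign=1; ASMBitLastSetU16(5) = 3, so uExponent = 3 - 1 + 16383 = 16385 and
   uMantissa = 5 << 61 = 0xa000000000000000, i.e. -1.25 * 2^2 = -5. */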
3925
3926
3927IEM_DECL_IMPL_DEF(void, iemAImpl_fld_r80_from_d80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTPBCD80U pd80Val))
3928{
3929 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3930 if ( pd80Val->s.abPairs[0] == 0
3931 && pd80Val->s.abPairs[1] == 0
3932 && pd80Val->s.abPairs[2] == 0
3933 && pd80Val->s.abPairs[3] == 0
3934 && pd80Val->s.abPairs[4] == 0
3935 && pd80Val->s.abPairs[5] == 0
3936 && pd80Val->s.abPairs[6] == 0
3937 && pd80Val->s.abPairs[7] == 0
3938 && pd80Val->s.abPairs[8] == 0)
3939 {
3940 pFpuRes->r80Result.s.fSign = pd80Val->s.fSign;
3941 pFpuRes->r80Result.s.uExponent = 0;
3942 pFpuRes->r80Result.s.uMantissa = 0;
3943 }
3944 else
3945 {
3946 pFpuRes->r80Result.s.fSign = pd80Val->s.fSign;
3947
3948 size_t cPairs = RT_ELEMENTS(pd80Val->s.abPairs);
3949 while (cPairs > 0 && pd80Val->s.abPairs[cPairs - 1] == 0)
3950 cPairs--;
3951
3952 uint64_t uVal = 0;
3953 uint64_t uFactor = 1;
3954 for (size_t iPair = 0; iPair < cPairs; iPair++, uFactor *= 100)
3955 uVal += RTPBCD80U_LO_DIGIT(pd80Val->s.abPairs[iPair]) * uFactor
3956 + RTPBCD80U_HI_DIGIT(pd80Val->s.abPairs[iPair]) * uFactor * 10;
3957
3958 unsigned const cBits = ASMBitLastSetU64(uVal);
3959 pFpuRes->r80Result.s.uExponent = cBits - 1 + RTFLOAT80U_EXP_BIAS;
3960 pFpuRes->r80Result.s.uMantissa = uVal << (RTFLOAT80U_FRACTION_BITS + 1 - cBits);
3961 }
3962}
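/* Worked example (illustrative only): the packed BCD value 1234 has
   abPairs[0]=0x34 and abPairs[1]=0x12; the loop above accumulates
   4*1 + 3*10 + 2*100 + 1*1000 = 1234, ASMBitLastSetU64(1234) = 11, so
   uExponent = 11 - 1 + 16383 = 16393 and uMantissa = 1234 << 53. */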
3963
3964
3965/*********************************************************************************************************************************
3966* x87 FPU Stores *
3967*********************************************************************************************************************************/
3968
3969/**
3970 * Helper for storing a deconstructed and normal R80 value as a 32-bit one.
3971 *
3972 * This uses the rounding rules indicated by fFcw and returns updated fFsw.
3973 *
3974 * @returns Updated FPU status word value.
3975 * @param fSignIn Incoming sign indicator.
3976 * @param uMantissaIn Incoming mantissa (dot between bit 63 and 62).
3977 * @param iExponentIn Unbiased exponent.
3978 * @param fFcw The FPU control word.
3979 * @param fFsw Prepped FPU status word, i.e. exceptions and C1 clear.
3980 * @param pr32Dst Where to return the output value, if one should be
3981 * returned.
3982 *
3983 * @note Tailored as a helper for iemAImpl_fst_r80_to_r32 right now.
3984 * @note Exact same logic as iemAImpl_StoreNormalR80AsR64.
3985 */
3986static uint16_t iemAImpl_StoreNormalR80AsR32(bool fSignIn, uint64_t uMantissaIn, int32_t iExponentIn,
3987 uint16_t fFcw, uint16_t fFsw, PRTFLOAT32U pr32Dst)
3988{
3989    uint64_t const fRoundingOffMask = RT_BIT_64(RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS) - 1; /* 0xffffffffff */
3990 uint64_t const uRoundingAdd = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
3991                                    ? RT_BIT_64(RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS - 1) /* 0x8000000000 */
3992 : (fFcw & X86_FCW_RC_MASK) == (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)
3993 ? fRoundingOffMask
3994 : 0;
3995 uint64_t fRoundedOff = uMantissaIn & fRoundingOffMask;
3996
3997 /*
3998 * Deal with potential overflows/underflows first, optimizing for none.
3999 * 0 and MAX are used for special values; MAX-1 may be rounded up to MAX.
4000 */
4001 int32_t iExponentOut = (int32_t)iExponentIn + RTFLOAT32U_EXP_BIAS;
4002 if ((uint32_t)iExponentOut - 1 < (uint32_t)(RTFLOAT32U_EXP_MAX - 3))
4003 { /* likely? */ }
4004 /*
4005     * Underflow if the exponent is zero or negative. This is mapped to a
4006     * subnormal number when possible, with some additional trickery of course.
4007 */
4008 else if (iExponentOut <= 0)
4009 {
4010 bool const fIsTiny = iExponentOut < 0
4011 || UINT64_MAX - uMantissaIn > uRoundingAdd;
4012 if (!(fFcw & X86_FCW_UM) && fIsTiny)
4013 /* Note! 754-1985 sec 7.4 has something about bias adjust of 192 here, not in 2008 & 2019. Perhaps only 8087 & 287? */
4014 return fFsw | X86_FSW_UE | X86_FSW_ES | X86_FSW_B;
4015
4016 if (iExponentOut <= 0)
4017 {
4018 uMantissaIn = iExponentOut <= -63
4019 ? uMantissaIn != 0
4020 : (uMantissaIn >> (-iExponentOut + 1)) | ((uMantissaIn & (RT_BIT_64(-iExponentOut + 1) - 1)) != 0);
4021 fRoundedOff = uMantissaIn & fRoundingOffMask;
4022 if (fRoundedOff && fIsTiny)
4023 fFsw |= X86_FSW_UE;
4024 iExponentOut = 0;
4025 }
4026 }
4027 /*
4028 * Overflow if at or above max exponent value or if we will reach max
4029 * when rounding. Will return +/-zero or +/-max value depending on
4030 * whether we're rounding or not.
4031 */
4032 else if ( iExponentOut >= RTFLOAT32U_EXP_MAX
4033 || ( iExponentOut == RTFLOAT32U_EXP_MAX - 1
4034 && UINT64_MAX - uMantissaIn <= uRoundingAdd))
4035 {
4036 fFsw |= X86_FSW_OE;
4037 if (!(fFcw & X86_FCW_OM))
4038 return fFsw | X86_FSW_ES | X86_FSW_B;
4039 fFsw |= X86_FSW_PE;
4040 if (uRoundingAdd)
4041 fFsw |= X86_FSW_C1;
4042 if (!(fFcw & X86_FCW_PM))
4043 fFsw |= X86_FSW_ES | X86_FSW_B;
4044
4045 pr32Dst->s.fSign = fSignIn;
4046 if (uRoundingAdd)
4047 { /* Zero */
4048 pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX;
4049 pr32Dst->s.uFraction = 0;
4050 }
4051 else
4052 { /* Max */
4053 pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX - 1;
4054 pr32Dst->s.uFraction = RT_BIT_32(RTFLOAT32U_FRACTION_BITS) - 1;
4055 }
4056 return fFsw;
4057 }
4058
4059 /*
4060 * Normal or subnormal number.
4061 */
4062 /* Do rounding - just truncate in near mode when midway on an even outcome. */
4063 uint64_t uMantissaOut = uMantissaIn;
4064 if ( (fFcw & X86_FCW_RC_MASK) != X86_FCW_RC_NEAREST
4065 || (uMantissaIn & RT_BIT_64(RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS))
4066 || fRoundedOff != uRoundingAdd)
4067 {
4068 uMantissaOut = uMantissaIn + uRoundingAdd;
4069 if (uMantissaOut >= uMantissaIn)
4070 { /* likely */ }
4071 else
4072 {
4073 uMantissaOut >>= 1; /* (We don't need to add bit 63 here (the integer bit), as it will be chopped off below.) */
4074 iExponentOut++;
4075 Assert(iExponentOut < RTFLOAT32U_EXP_MAX); /* checked above */
4076 fFsw |= X86_FSW_C1;
4077 }
4078 }
4079 else
4080 uMantissaOut = uMantissaIn;
4081
4082 /* Truncate the mantissa and set the return value. */
4083 uMantissaOut >>= RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS;
4084
4085 pr32Dst->s.uFraction = (uint32_t)uMantissaOut; /* Note! too big for bitfield if normal. */
4086 pr32Dst->s.uExponent = iExponentOut;
4087 pr32Dst->s.fSign = fSignIn;
4088
4089    /* Set status flags related to rounding. */
4090 if (fRoundedOff)
4091 {
4092 fFsw |= X86_FSW_PE;
4093 if (uMantissaOut > (uMantissaIn >> (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS)))
4094 fFsw |= X86_FSW_C1;
4095 if (!(fFcw & X86_FCW_PM))
4096 fFsw |= X86_FSW_ES | X86_FSW_B;
4097 }
4098
4099 return fFsw;
4100}
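/* Worked example (illustrative only): storing 1 + 2^-24 (uMantissaIn =
   0x8000008000000000, iExponentIn = 0) with round-to-nearest: the 40 dropped bits
   are exactly half an ULP (fRoundedOff == uRoundingAdd) and the lowest kept bit
   (bit 40) is clear, so the round-to-even shortcut above truncates to 1.0f and
   only PE is raised (C1 stays clear). */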
4101
4102
4103/**
4104 * @note Exact same logic as iemAImpl_fst_r80_to_r64.
4105 */
4106IEM_DECL_IMPL_DEF(void, iemAImpl_fst_r80_to_r32,(PCX86FXSTATE pFpuState, uint16_t *pu16FSW,
4107 PRTFLOAT32U pr32Dst, PCRTFLOAT80U pr80Src))
4108{
4109 uint16_t const fFcw = pFpuState->FCW;
4110 uint16_t fFsw = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3));
4111 if (RTFLOAT80U_IS_NORMAL(pr80Src))
4112 fFsw = iemAImpl_StoreNormalR80AsR32(pr80Src->s.fSign, pr80Src->s.uMantissa,
4113 (int32_t)pr80Src->s.uExponent - RTFLOAT80U_EXP_BIAS, fFcw, fFsw, pr32Dst);
4114 else if (RTFLOAT80U_IS_ZERO(pr80Src))
4115 {
4116 pr32Dst->s.fSign = pr80Src->s.fSign;
4117 pr32Dst->s.uExponent = 0;
4118 pr32Dst->s.uFraction = 0;
4119 Assert(RTFLOAT32U_IS_ZERO(pr32Dst));
4120 }
4121 else if (RTFLOAT80U_IS_INF(pr80Src))
4122 {
4123 pr32Dst->s.fSign = pr80Src->s.fSign;
4124 pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX;
4125 pr32Dst->s.uFraction = 0;
4126 Assert(RTFLOAT32U_IS_INF(pr32Dst));
4127 }
4128 else if (RTFLOAT80U_IS_INDEFINITE(pr80Src))
4129 {
4130 /* Mapped to +/-QNaN */
4131 pr32Dst->s.fSign = pr80Src->s.fSign;
4132 pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX;
4133 pr32Dst->s.uFraction = RT_BIT_32(RTFLOAT32U_FRACTION_BITS - 1);
4134 }
4135 else if (RTFLOAT80U_IS_PSEUDO_INF(pr80Src) || RTFLOAT80U_IS_UNNORMAL(pr80Src) || RTFLOAT80U_IS_PSEUDO_NAN(pr80Src))
4136 {
4137 /* Pseudo-Inf / Pseudo-Nan / Unnormal -> QNaN (during load, probably) */
4138 if (fFcw & X86_FCW_IM)
4139 {
4140 pr32Dst->s.fSign = 1;
4141 pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX;
4142 pr32Dst->s.uFraction = RT_BIT_32(RTFLOAT32U_FRACTION_BITS - 1);
4143 fFsw |= X86_FSW_IE;
4144 }
4145 else
4146        fFsw |= X86_FSW_IE | X86_FSW_ES | X86_FSW_B;
4147 }
4148 else if (RTFLOAT80U_IS_NAN(pr80Src))
4149 {
4150 /* IM applies to signalled NaN input only. Everything is converted to quiet NaN. */
4151 if ((fFcw & X86_FCW_IM) || !RTFLOAT80U_IS_SIGNALLING_NAN(pr80Src))
4152 {
4153 pr32Dst->s.fSign = pr80Src->s.fSign;
4154 pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX;
4155 pr32Dst->s.uFraction = (uint32_t)(pr80Src->sj64.uFraction >> (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS));
4156 pr32Dst->s.uFraction |= RT_BIT_32(RTFLOAT32U_FRACTION_BITS - 1);
4157 if (RTFLOAT80U_IS_SIGNALLING_NAN(pr80Src))
4158 fFsw |= X86_FSW_IE;
4159 }
4160 else
4161 fFsw |= X86_FSW_IE | X86_FSW_ES | X86_FSW_B;
4162 }
4163 else
4164 {
4165        /* Denormal values cause both an underflow and a precision exception. */
4166 Assert(RTFLOAT80U_IS_DENORMAL(pr80Src) || RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Src));
4167 if (fFcw & X86_FCW_UM)
4168 {
4169 pr32Dst->s.fSign = pr80Src->s.fSign;
4170 pr32Dst->s.uExponent = 0;
4171 if ((fFcw & X86_FCW_RC_MASK) == (!pr80Src->s.fSign ? X86_FCW_RC_UP : X86_FCW_RC_DOWN))
4172 {
4173 pr32Dst->s.uFraction = 1;
4174 fFsw |= X86_FSW_UE | X86_FSW_PE | X86_FSW_C1;
4175 if (!(fFcw & X86_FCW_PM))
4176 fFsw |= X86_FSW_ES | X86_FSW_B;
4177 }
4178 else
4179 {
4180 pr32Dst->s.uFraction = 0;
4181 fFsw |= X86_FSW_UE | X86_FSW_PE;
4182 if (!(fFcw & X86_FCW_PM))
4183 fFsw |= X86_FSW_ES | X86_FSW_B;
4184 }
4185 }
4186 else
4187 fFsw |= X86_FSW_UE | X86_FSW_ES | X86_FSW_B;
4188 }
4189 *pu16FSW = fFsw;
4190}
4191
4192
4193/**
4194 * Helper for storing a deconstructed and normal R80 value as a 64-bit one.
4195 *
4196 * This uses the rounding rules indicated by fFcw and returns updated fFsw.
4197 *
4198 * @returns Updated FPU status word value.
4199 * @param fSignIn Incoming sign indicator.
4200 * @param uMantissaIn Incoming mantissa (dot between bit 63 and 62).
4201 * @param iExponentIn Unbiased exponent.
4202 * @param fFcw The FPU control word.
4203 * @param fFsw Prepped FPU status word, i.e. exceptions and C1 clear.
4204 * @param pr64Dst Where to return the output value, if one should be
4205 * returned.
4206 *
4207 * @note Tailored as a helper for iemAImpl_fst_r80_to_r64 right now.
4208 * @note Exact same logic as iemAImpl_StoreNormalR80AsR32.
4209 */
4210static uint16_t iemAImpl_StoreNormalR80AsR64(bool fSignIn, uint64_t uMantissaIn, int32_t iExponentIn,
4211 uint16_t fFcw, uint16_t fFsw, PRTFLOAT64U pr64Dst)
4212{
4213 uint64_t const fRoundingOffMask = RT_BIT_64(RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS) - 1; /* 0x7ff */
4214 uint32_t const uRoundingAdd = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
4215 ? RT_BIT_64(RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS - 1) /* 0x400 */
4216 : (fFcw & X86_FCW_RC_MASK) == (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)
4217 ? fRoundingOffMask
4218 : 0;
4219 uint32_t fRoundedOff = uMantissaIn & fRoundingOffMask;
4220
4221 /*
4222 * Deal with potential overflows/underflows first, optimizing for none.
4223 * 0 and MAX are used for special values; MAX-1 may be rounded up to MAX.
4224 */
4225 int32_t iExponentOut = (int32_t)iExponentIn + RTFLOAT64U_EXP_BIAS;
4226 if ((uint32_t)iExponentOut - 1 < (uint32_t)(RTFLOAT64U_EXP_MAX - 3))
4227 { /* likely? */ }
4228 /*
4229     * Underflow if the exponent is zero or negative. This is mapped to a
4230     * subnormal number when possible, with some additional trickery of course.
4231 */
4232 else if (iExponentOut <= 0)
4233 {
4234 bool const fIsTiny = iExponentOut < 0
4235 || UINT64_MAX - uMantissaIn > uRoundingAdd;
4236 if (!(fFcw & X86_FCW_UM) && fIsTiny)
4237 /* Note! 754-1985 sec 7.4 has something about bias adjust of 1536 here, not in 2008 & 2019. Perhaps only 8087 & 287? */
4238 return fFsw | X86_FSW_UE | X86_FSW_ES | X86_FSW_B;
4239
4240 if (iExponentOut <= 0)
4241 {
4242 uMantissaIn = iExponentOut <= -63
4243 ? uMantissaIn != 0
4244 : (uMantissaIn >> (-iExponentOut + 1)) | ((uMantissaIn & (RT_BIT_64(-iExponentOut + 1) - 1)) != 0);
4245 fRoundedOff = uMantissaIn & fRoundingOffMask;
4246 if (fRoundedOff && fIsTiny)
4247 fFsw |= X86_FSW_UE;
4248 iExponentOut = 0;
4249 }
4250 }
4251 /*
4252 * Overflow if at or above max exponent value or if we will reach max
4253 * when rounding. Will return +/-zero or +/-max value depending on
4254 * whether we're rounding or not.
4255 */
4256 else if ( iExponentOut >= RTFLOAT64U_EXP_MAX
4257 || ( iExponentOut == RTFLOAT64U_EXP_MAX - 1
4258 && UINT64_MAX - uMantissaIn <= uRoundingAdd))
4259 {
4260 fFsw |= X86_FSW_OE;
4261 if (!(fFcw & X86_FCW_OM))
4262 return fFsw | X86_FSW_ES | X86_FSW_B;
4263 fFsw |= X86_FSW_PE;
4264 if (uRoundingAdd)
4265 fFsw |= X86_FSW_C1;
4266 if (!(fFcw & X86_FCW_PM))
4267 fFsw |= X86_FSW_ES | X86_FSW_B;
4268
4269 pr64Dst->s64.fSign = fSignIn;
4270 if (uRoundingAdd)
4271 { /* Zero */
4272 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX;
4273 pr64Dst->s64.uFraction = 0;
4274 }
4275 else
4276 { /* Max */
4277 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX - 1;
4278 pr64Dst->s64.uFraction = RT_BIT_64(RTFLOAT64U_FRACTION_BITS) - 1;
4279 }
4280 return fFsw;
4281 }
4282
4283 /*
4284 * Normal or subnormal number.
4285 */
4286 /* Do rounding - just truncate in near mode when midway on an even outcome. */
4287 uint64_t uMantissaOut = uMantissaIn;
4288 if ( (fFcw & X86_FCW_RC_MASK) != X86_FCW_RC_NEAREST
4289 || (uMantissaIn & RT_BIT_32(RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS))
4290 || fRoundedOff != uRoundingAdd)
4291 {
4292 uMantissaOut = uMantissaIn + uRoundingAdd;
4293 if (uMantissaOut >= uMantissaIn)
4294 { /* likely */ }
4295 else
4296 {
4297 uMantissaOut >>= 1; /* (We don't need to add bit 63 here (the integer bit), as it will be chopped off below.) */
4298 iExponentOut++;
4299 Assert(iExponentOut < RTFLOAT64U_EXP_MAX); /* checked above */
4300 fFsw |= X86_FSW_C1;
4301 }
4302 }
4303 else
4304 uMantissaOut = uMantissaIn;
4305
4306 /* Truncate the mantissa and set the return value. */
4307 uMantissaOut >>= RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS;
4308
4309 pr64Dst->s64.uFraction = uMantissaOut; /* Note! too big for bitfield if normal. */
4310 pr64Dst->s64.uExponent = iExponentOut;
4311 pr64Dst->s64.fSign = fSignIn;
4312
4313    /* Set status flags related to rounding. */
4314 if (fRoundedOff)
4315 {
4316 fFsw |= X86_FSW_PE;
4317 if (uMantissaOut > (uMantissaIn >> (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS)))
4318 fFsw |= X86_FSW_C1;
4319 if (!(fFcw & X86_FCW_PM))
4320 fFsw |= X86_FSW_ES | X86_FSW_B;
4321 }
4322
4323 return fFsw;
4324}
4325
4326
4327/**
4328 * @note Exact same logic as iemAImpl_fst_r80_to_r32.
4329 */
4330IEM_DECL_IMPL_DEF(void, iemAImpl_fst_r80_to_r64,(PCX86FXSTATE pFpuState, uint16_t *pu16FSW,
4331 PRTFLOAT64U pr64Dst, PCRTFLOAT80U pr80Src))
4332{
4333 uint16_t const fFcw = pFpuState->FCW;
4334 uint16_t fFsw = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3));
4335 if (RTFLOAT80U_IS_NORMAL(pr80Src))
4336 fFsw = iemAImpl_StoreNormalR80AsR64(pr80Src->s.fSign, pr80Src->s.uMantissa,
4337 (int32_t)pr80Src->s.uExponent - RTFLOAT80U_EXP_BIAS, fFcw, fFsw, pr64Dst);
4338 else if (RTFLOAT80U_IS_ZERO(pr80Src))
4339 {
4340 pr64Dst->s64.fSign = pr80Src->s.fSign;
4341 pr64Dst->s64.uExponent = 0;
4342 pr64Dst->s64.uFraction = 0;
4343 Assert(RTFLOAT64U_IS_ZERO(pr64Dst));
4344 }
4345 else if (RTFLOAT80U_IS_INF(pr80Src))
4346 {
4347 pr64Dst->s64.fSign = pr80Src->s.fSign;
4348 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX;
4349 pr64Dst->s64.uFraction = 0;
4350 Assert(RTFLOAT64U_IS_INF(pr64Dst));
4351 }
4352 else if (RTFLOAT80U_IS_INDEFINITE(pr80Src))
4353 {
4354 /* Mapped to +/-QNaN */
4355 pr64Dst->s64.fSign = pr80Src->s.fSign;
4356 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX;
4357 pr64Dst->s64.uFraction = RT_BIT_64(RTFLOAT64U_FRACTION_BITS - 1);
4358 }
4359 else if (RTFLOAT80U_IS_PSEUDO_INF(pr80Src) || RTFLOAT80U_IS_UNNORMAL(pr80Src) || RTFLOAT80U_IS_PSEUDO_NAN(pr80Src))
4360 {
4361 /* Pseudo-Inf / Pseudo-Nan / Unnormal -> QNaN (during load, probably) */
4362 if (fFcw & X86_FCW_IM)
4363 {
4364 pr64Dst->s64.fSign = 1;
4365 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX;
4366 pr64Dst->s64.uFraction = RT_BIT_64(RTFLOAT64U_FRACTION_BITS - 1);
4367 fFsw |= X86_FSW_IE;
4368 }
4369 else
4370        fFsw |= X86_FSW_IE | X86_FSW_ES | X86_FSW_B;
4371 }
4372 else if (RTFLOAT80U_IS_NAN(pr80Src))
4373 {
4374 /* IM applies to signalled NaN input only. Everything is converted to quiet NaN. */
4375 if ((fFcw & X86_FCW_IM) || !RTFLOAT80U_IS_SIGNALLING_NAN(pr80Src))
4376 {
4377 pr64Dst->s64.fSign = pr80Src->s.fSign;
4378 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX;
4379 pr64Dst->s64.uFraction = pr80Src->sj64.uFraction >> (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS);
4380 pr64Dst->s64.uFraction |= RT_BIT_64(RTFLOAT64U_FRACTION_BITS - 1);
4381 if (RTFLOAT80U_IS_SIGNALLING_NAN(pr80Src))
4382 fFsw |= X86_FSW_IE;
4383 }
4384 else
4385 fFsw |= X86_FSW_IE | X86_FSW_ES | X86_FSW_B;
4386 }
4387 else
4388 {
4389        /* Denormal values cause both an underflow and a precision exception. */
4390 Assert(RTFLOAT80U_IS_DENORMAL(pr80Src) || RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Src));
4391 if (fFcw & X86_FCW_UM)
4392 {
4393 pr64Dst->s64.fSign = pr80Src->s.fSign;
4394 pr64Dst->s64.uExponent = 0;
4395 if ((fFcw & X86_FCW_RC_MASK) == (!pr80Src->s.fSign ? X86_FCW_RC_UP : X86_FCW_RC_DOWN))
4396 {
4397 pr64Dst->s64.uFraction = 1;
4398 fFsw |= X86_FSW_UE | X86_FSW_PE | X86_FSW_C1;
4399 if (!(fFcw & X86_FCW_PM))
4400 fFsw |= X86_FSW_ES | X86_FSW_B;
4401 }
4402 else
4403 {
4404 pr64Dst->s64.uFraction = 0;
4405 fFsw |= X86_FSW_UE | X86_FSW_PE;
4406 if (!(fFcw & X86_FCW_PM))
4407 fFsw |= X86_FSW_ES | X86_FSW_B;
4408 }
4409 }
4410 else
4411 fFsw |= X86_FSW_UE | X86_FSW_ES | X86_FSW_B;
4412 }
4413 *pu16FSW = fFsw;
4414}
4415
4416
4417IEM_DECL_IMPL_DEF(void, iemAImpl_fst_r80_to_r80,(PCX86FXSTATE pFpuState, uint16_t *pu16FSW,
4418 PRTFLOAT80U pr80Dst, PCRTFLOAT80U pr80Src))
4419{
4420 /*
4421 * FPU status word:
4422 * - TOP is irrelevant, but we must match x86 assembly version (0).
4423 * - C1 is always cleared as we don't have any stack overflows.
4424 * - C0, C2, and C3 are undefined and Intel 10980XE does not touch them.
4425 */
4426 *pu16FSW = pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3); /* see iemAImpl_fld1 */
4427 *pr80Dst = *pr80Src;
4428}
4429
4430
4431/*
4432 *
4433 * Mantissa:
4434 * 63 56 48 40 32 24 16 8 0
4435 * v v v v v v v v v
4436 * 1[.]111 0000 1111 0000 1111 0000 1111 0000 1111 0000 1111 0000 1111 0000 1111 0000
4437 * \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \
4438 * Exp: 0 4 8 12 16 20 24 28 32 36 40 44 48 52 56 60
4439 *
4440 * int64_t has the same width, only bit 63 is the sign bit. So, the max we can map over
4441 * are bits 1 thru 63, dropping off bit 0, with an exponent of 62. The number of bits we
4442 * drop off from the mantissa increases with decreasing exponent, till an exponent of 0
4443 * where we'll drop off all but bit 63.
4444 */
4445#define EMIT_FIST(a_cBits, a_iType, a_iTypeMin, a_iTypeIndefinite) \
4446IEM_DECL_IMPL_DEF(void, iemAImpl_fist_r80_to_i ## a_cBits,(PCX86FXSTATE pFpuState, uint16_t *pu16FSW, \
4447 a_iType *piDst, PCRTFLOAT80U pr80Val)) \
4448{ \
4449 uint16_t const fFcw = pFpuState->FCW; \
4450 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); \
4451 bool const fSignIn = pr80Val->s.fSign; \
4452 \
4453 /* \
4454 * Deal with normal numbers first. \
4455 */ \
4456 if (RTFLOAT80U_IS_NORMAL(pr80Val)) \
4457 { \
4458 uint64_t uMantissa = pr80Val->s.uMantissa; \
4459 int32_t iExponent = (int32_t)pr80Val->s.uExponent - RTFLOAT80U_EXP_BIAS; \
4460 \
4461 if ((uint32_t)iExponent <= a_cBits - 2) \
4462 { \
4463 unsigned const cShiftOff = 63 - iExponent; \
4464 uint64_t const fRoundingOffMask = RT_BIT_64(cShiftOff) - 1; \
4465 uint64_t const uRoundingAdd = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST \
4466 ? RT_BIT_64(cShiftOff - 1) \
4467 : (fFcw & X86_FCW_RC_MASK) == (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP) \
4468 ? fRoundingOffMask \
4469 : 0; \
4470 uint64_t fRoundedOff = uMantissa & fRoundingOffMask; \
4471 \
4472 uMantissa >>= cShiftOff; \
4473 uint64_t const uRounding = (fRoundedOff + uRoundingAdd) >> cShiftOff; \
4474 uMantissa += uRounding; \
4475 if (!(uMantissa & RT_BIT_64(a_cBits - 1))) \
4476 { \
4477 if (fRoundedOff) \
4478 { \
4479 if ((uMantissa & 1) && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST && fRoundedOff == uRoundingAdd) \
4480 uMantissa &= ~(uint64_t)1; /* round to even number if equal distance between up/down. */ \
4481 else if (uRounding) \
4482 fFsw |= X86_FSW_C1; \
4483 fFsw |= X86_FSW_PE; \
4484 if (!(fFcw & X86_FCW_PM)) \
4485 fFsw |= X86_FSW_ES | X86_FSW_B; \
4486 } \
4487 \
4488 if (!fSignIn) \
4489 *piDst = (a_iType)uMantissa; \
4490 else \
4491 *piDst = -(a_iType)uMantissa; \
4492 } \
4493 else \
4494 { \
4495 /* overflowed after rounding. */ \
4496 AssertMsg(iExponent == a_cBits - 2 && uMantissa == RT_BIT_64(a_cBits - 1), \
4497 ("e=%d m=%#RX64 (org %#RX64) s=%d; shift=%d ro=%#RX64 rm=%#RX64 ra=%#RX64\n", iExponent, uMantissa, \
4498 pr80Val->s.uMantissa, fSignIn, cShiftOff, fRoundedOff, fRoundingOffMask, uRoundingAdd)); \
4499 \
4500 /* Special case for the integer minimum value. */ \
4501 if (fSignIn) \
4502 { \
4503 *piDst = a_iTypeMin; \
4504 fFsw |= X86_FSW_PE | X86_FSW_C1; \
4505 if (!(fFcw & X86_FCW_PM)) \
4506 fFsw |= X86_FSW_ES | X86_FSW_B; \
4507 } \
4508 else \
4509 { \
4510 fFsw |= X86_FSW_IE; \
4511 if (fFcw & X86_FCW_IM) \
4512 *piDst = a_iTypeMin; \
4513 else \
4514 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT); \
4515 } \
4516 } \
4517 } \
4518 /* \
4519 * Tiny sub-zero numbers. \
4520 */ \
4521 else if (iExponent < 0) \
4522 { \
4523 if (!fSignIn) \
4524 { \
4525 if ( (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_UP \
4526 || (iExponent == -1 && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST)) \
4527 { \
4528 *piDst = 1; \
4529 fFsw |= X86_FSW_C1; \
4530 } \
4531 else \
4532 *piDst = 0; \
4533 } \
4534 else \
4535 { \
4536 if ( (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_UP \
4537 || (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_ZERO \
4538 || (iExponent < -1 && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST)) \
4539 *piDst = 0; \
4540 else \
4541 { \
4542 *piDst = -1; \
4543 fFsw |= X86_FSW_C1; \
4544 } \
4545 } \
4546 fFsw |= X86_FSW_PE; \
4547 if (!(fFcw & X86_FCW_PM)) \
4548 fFsw |= X86_FSW_ES | X86_FSW_B; \
4549 } \
4550 /* \
4551 * Special MIN case. \
4552 */ \
4553 else if ( fSignIn && iExponent == a_cBits - 1 \
4554 && ( a_cBits < 64 && (fFcw & X86_FCW_RC_MASK) != X86_FCW_RC_DOWN \
4555 ? uMantissa < (RT_BIT_64(63) | RT_BIT_64(65 - a_cBits)) \
4556 : uMantissa == RT_BIT_64(63))) \
4557 { \
4558 *piDst = a_iTypeMin; \
4559 if (uMantissa & (RT_BIT_64(64 - a_cBits + 1) - 1)) \
4560 { \
4561 fFsw |= X86_FSW_PE; \
4562 if (!(fFcw & X86_FCW_PM)) \
4563 fFsw |= X86_FSW_ES | X86_FSW_B; \
4564 } \
4565 } \
4566 /* \
4567 * Too large/small number outside the target integer range. \
4568 */ \
4569 else \
4570 { \
4571 fFsw |= X86_FSW_IE; \
4572 if (fFcw & X86_FCW_IM) \
4573 *piDst = a_iTypeIndefinite; \
4574 else \
4575 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT); \
4576 } \
4577 } \
4578 /* \
4579 * Map both +0 and -0 to integer zero (signless/+). \
4580 */ \
4581 else if (RTFLOAT80U_IS_ZERO(pr80Val)) \
4582 *piDst = 0; \
4583 /* \
4584 * Denormals are just really tiny sub-zero numbers that are either rounded \
4585 * to zero, 1 or -1 depending on sign and rounding control. \
4586 */ \
4587 else if (RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val) || RTFLOAT80U_IS_DENORMAL(pr80Val)) \
4588 { \
4589 if ((fFcw & X86_FCW_RC_MASK) != (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)) \
4590 *piDst = 0; \
4591 else \
4592 { \
4593 *piDst = fSignIn ? -1 : 1; \
4594 fFsw |= X86_FSW_C1; \
4595 } \
4596 fFsw |= X86_FSW_PE; \
4597 if (!(fFcw & X86_FCW_PM)) \
4598 fFsw |= X86_FSW_ES | X86_FSW_B; \
4599 } \
4600 /* \
4601 * All other special values are considered invalid arguments and result \
4602 * in an IE exception and indefinite value if masked. \
4603 */ \
4604 else \
4605 { \
4606 fFsw |= X86_FSW_IE; \
4607 if (fFcw & X86_FCW_IM) \
4608 *piDst = a_iTypeIndefinite; \
4609 else \
4610 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT); \
4611 } \
4612 *pu16FSW = fFsw; \
4613}
4614EMIT_FIST(64, int64_t, INT64_MIN, X86_FPU_INT64_INDEFINITE)
4615EMIT_FIST(32, int32_t, INT32_MIN, X86_FPU_INT32_INDEFINITE)
4616EMIT_FIST(16, int16_t, INT16_MIN, X86_FPU_INT16_INDEFINITE)
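/* Worked example (illustrative only): fist dword with st0 = 2.5 (uMantissa =
   0xa000000000000000, unbiased exponent 1) gives cShiftOff = 62 and a rounded-off
   part of exactly half (bit 61); the rounding add first produces 3, which the
   round-to-even test above turns back into 2, with PE raised and C1 left clear. */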
4617
4618#endif /*IEM_WITHOUT_ASSEMBLY */
4619
4620
4621/*
4622 * The FISTT instruction was added with SSE3 and is a lot simpler than FIST.
4623 *
4624 * The 16-bit version is a bit peculiar, though, as it seems to be raising IE
4625 * as if it was the 32-bit version (i.e. starting with exp 31 instead of 15),
4626 * thus the @a a_cBitsIn.
4627 */
4628#define EMIT_FISTT(a_cBits, a_cBitsIn, a_iType, a_iTypeMin, a_iTypeMax, a_iTypeIndefinite, a_Suffix, a_fIntelVersion) \
4629IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_fistt_r80_to_i,a_cBits,a_Suffix),(PCX86FXSTATE pFpuState, uint16_t *pu16FSW, \
4630 a_iType *piDst, PCRTFLOAT80U pr80Val)) \
4631{ \
4632 uint16_t const fFcw = pFpuState->FCW; \
4633 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); \
4634 bool const fSignIn = pr80Val->s.fSign; \
4635 \
4636 /* \
4637 * Deal with normal numbers first. \
4638 */ \
4639 if (RTFLOAT80U_IS_NORMAL(pr80Val)) \
4640 { \
4641 uint64_t uMantissa = pr80Val->s.uMantissa; \
4642 int32_t iExponent = (int32_t)pr80Val->s.uExponent - RTFLOAT80U_EXP_BIAS; \
4643 \
4644 if ((uint32_t)iExponent <= a_cBitsIn - 2) \
4645 { \
4646 unsigned const cShiftOff = 63 - iExponent; \
4647 uint64_t const fRoundingOffMask = RT_BIT_64(cShiftOff) - 1; \
4648 uint64_t const fRoundedOff = uMantissa & fRoundingOffMask; \
4649 uMantissa >>= cShiftOff; \
4650 /*Assert(!(uMantissa & RT_BIT_64(a_cBits - 1)));*/ \
4651 if (!fSignIn) \
4652 *piDst = (a_iType)uMantissa; \
4653 else \
4654 *piDst = -(a_iType)uMantissa; \
4655 \
4656 if (fRoundedOff) \
4657 { \
4658 fFsw |= X86_FSW_PE; \
4659 if (!(fFcw & X86_FCW_PM)) \
4660 fFsw |= X86_FSW_ES | X86_FSW_B; \
4661 } \
4662 } \
4663 /* \
4664 * Tiny sub-zero numbers. \
4665 */ \
4666 else if (iExponent < 0) \
4667 { \
4668 *piDst = 0; \
4669 fFsw |= X86_FSW_PE; \
4670 if (!(fFcw & X86_FCW_PM)) \
4671 fFsw |= X86_FSW_ES | X86_FSW_B; \
4672 } \
4673 /* \
4674 * Special MIN case. \
4675 */ \
4676 else if ( fSignIn && iExponent == a_cBits - 1 \
4677 && (a_cBits < 64 \
4678 ? uMantissa < (RT_BIT_64(63) | RT_BIT_64(65 - a_cBits)) \
4679 : uMantissa == RT_BIT_64(63)) ) \
4680 { \
4681 *piDst = a_iTypeMin; \
4682 if (uMantissa & (RT_BIT_64(64 - a_cBits + 1) - 1)) \
4683 { \
4684 fFsw |= X86_FSW_PE; \
4685 if (!(fFcw & X86_FCW_PM)) \
4686 fFsw |= X86_FSW_ES | X86_FSW_B; \
4687 } \
4688 } \
4689 /* \
4690 * Figure this weirdness. \
4691 */ \
4692 else if (0 /* huh? gone? */ && a_cBits == 16 && fSignIn && iExponent == 31 && uMantissa < UINT64_C(0x8000100000000000) ) \
4693 { \
4694 *piDst = 0; \
4695 if (uMantissa & (RT_BIT_64(64 - a_cBits + 1) - 1)) \
4696 { \
4697 fFsw |= X86_FSW_PE; \
4698 if (!(fFcw & X86_FCW_PM)) \
4699 fFsw |= X86_FSW_ES | X86_FSW_B; \
4700 } \
4701 } \
4702 /* \
4703 * Too large/small number outside the target integer range. \
4704 */ \
4705 else \
4706 { \
4707 fFsw |= X86_FSW_IE; \
4708 if (fFcw & X86_FCW_IM) \
4709 *piDst = a_iTypeIndefinite; \
4710 else \
4711 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT); \
4712 } \
4713 } \
4714 /* \
4715 * Map both +0 and -0 to integer zero (signless/+). \
4716 */ \
4717 else if (RTFLOAT80U_IS_ZERO(pr80Val)) \
4718 *piDst = 0; \
4719 /* \
4720 * Denormals are just really tiny sub-zero numbers that are truncated to zero. \
4721 */ \
4722 else if (RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val) || RTFLOAT80U_IS_DENORMAL(pr80Val)) \
4723 { \
4724 *piDst = 0; \
4725 fFsw |= X86_FSW_PE; \
4726 if (!(fFcw & X86_FCW_PM)) \
4727 fFsw |= X86_FSW_ES | X86_FSW_B; \
4728 } \
4729 /* \
4730 * All other special values are considered invalid arguments and result \
4731 * in an IE exception and indefinite value if masked. \
4732 */ \
4733 else \
4734 { \
4735 fFsw |= X86_FSW_IE; \
4736 if (fFcw & X86_FCW_IM) \
4737 *piDst = a_iTypeIndefinite; \
4738 else \
4739 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT); \
4740 } \
4741 *pu16FSW = fFsw; \
4742}
4743#if defined(IEM_WITHOUT_ASSEMBLY)
4744EMIT_FISTT(64, 64, int64_t, INT64_MIN, INT64_MAX, X86_FPU_INT64_INDEFINITE, RT_NOTHING, 1)
4745EMIT_FISTT(32, 32, int32_t, INT32_MIN, INT32_MAX, X86_FPU_INT32_INDEFINITE, RT_NOTHING, 1)
4746EMIT_FISTT(16, 16, int16_t, INT16_MIN, INT16_MAX, X86_FPU_INT16_INDEFINITE, RT_NOTHING, 1)
4747#endif
4748EMIT_FISTT(16, 16, int16_t, INT16_MIN, INT16_MAX, X86_FPU_INT16_INDEFINITE, _intel, 1)
4749EMIT_FISTT(16, 16, int16_t, INT16_MIN, INT16_MAX, X86_FPU_INT16_INDEFINITE, _amd, 0)
4750
4751
4752#if defined(IEM_WITHOUT_ASSEMBLY)
4753
4754IEM_DECL_IMPL_DEF(void, iemAImpl_fst_r80_to_d80,(PCX86FXSTATE pFpuState, uint16_t *pu16FSW,
4755 PRTPBCD80U pd80Dst, PCRTFLOAT80U pr80Src))
4756{
4757 /*static RTPBCD80U const s_ad80MaxMin[2] = { RTPBCD80U_INIT_MAX(), RTPBCD80U_INIT_MIN() };*/
4758 static RTPBCD80U const s_ad80Zeros[2] = { RTPBCD80U_INIT_ZERO(0), RTPBCD80U_INIT_ZERO(1) };
4759 static RTPBCD80U const s_ad80One[2] = { RTPBCD80U_INIT_C(0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,1),
4760 RTPBCD80U_INIT_C(1, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,1) };
4761 static RTPBCD80U const s_d80Indefinite = RTPBCD80U_INIT_INDEFINITE();
4762
4763 uint16_t const fFcw = pFpuState->FCW;
4764 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3));
4765 bool const fSignIn = pr80Src->s.fSign;
4766
4767 /*
4768 * Deal with normal numbers first.
4769 */
4770 if (RTFLOAT80U_IS_NORMAL(pr80Src))
4771 {
4772 uint64_t uMantissa = pr80Src->s.uMantissa;
4773 int32_t iExponent = (int32_t)pr80Src->s.uExponent - RTFLOAT80U_EXP_BIAS;
4774 if ( (uint32_t)iExponent <= 58
4775 || ((uint32_t)iExponent == 59 && uMantissa <= UINT64_C(0xde0b6b3a763fffff)) )
4776 {
4777 unsigned const cShiftOff = 63 - iExponent;
4778 uint64_t const fRoundingOffMask = RT_BIT_64(cShiftOff) - 1;
4779 uint64_t const uRoundingAdd = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
4780 ? RT_BIT_64(cShiftOff - 1)
4781 : (fFcw & X86_FCW_RC_MASK) == (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)
4782 ? fRoundingOffMask
4783 : 0;
4784 uint64_t fRoundedOff = uMantissa & fRoundingOffMask;
4785
4786 uMantissa >>= cShiftOff;
4787 uint64_t const uRounding = (fRoundedOff + uRoundingAdd) >> cShiftOff;
4788 uMantissa += uRounding;
4789 if (uMantissa <= (uint64_t)RTPBCD80U_MAX)
4790 {
4791 if (fRoundedOff)
4792 {
4793 if ((uMantissa & 1) && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST && fRoundedOff == uRoundingAdd)
4794 uMantissa &= ~(uint64_t)1; /* round to even number if equal distance between up/down. */
4795 else if (uRounding)
4796 fFsw |= X86_FSW_C1;
4797 fFsw |= X86_FSW_PE;
4798 if (!(fFcw & X86_FCW_PM))
4799 fFsw |= X86_FSW_ES | X86_FSW_B;
4800 }
4801
4802 pd80Dst->s.fSign = fSignIn;
4803 pd80Dst->s.uPad = 0;
4804 for (size_t iPair = 0; iPair < RT_ELEMENTS(pd80Dst->s.abPairs); iPair++)
4805 {
4806 unsigned const uDigits = uMantissa % 100;
4807 uMantissa /= 100;
4808 uint8_t const bLo = uDigits % 10;
4809 uint8_t const bHi = uDigits / 10;
4810 pd80Dst->s.abPairs[iPair] = RTPBCD80U_MAKE_PAIR(bHi, bLo);
4811 }
4812 }
4813 else
4814 {
4815 /* overflowed after rounding. */
4816 fFsw |= X86_FSW_IE;
4817 if (fFcw & X86_FCW_IM)
4818 *pd80Dst = s_d80Indefinite;
4819 else
4820 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT);
4821 }
4822 }
4823 /*
4824 * Tiny sub-zero numbers.
4825 */
4826 else if (iExponent < 0)
4827 {
4828 if (!fSignIn)
4829 {
4830 if ( (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_UP
4831 || (iExponent == -1 && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST))
4832 {
4833 *pd80Dst = s_ad80One[fSignIn];
4834 fFsw |= X86_FSW_C1;
4835 }
4836 else
4837 *pd80Dst = s_ad80Zeros[fSignIn];
4838 }
4839 else
4840 {
4841 if ( (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_UP
4842 || (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_ZERO
4843 || (iExponent < -1 && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST))
4844 *pd80Dst = s_ad80Zeros[fSignIn];
4845 else
4846 {
4847 *pd80Dst = s_ad80One[fSignIn];
4848 fFsw |= X86_FSW_C1;
4849 }
4850 }
4851 fFsw |= X86_FSW_PE;
4852 if (!(fFcw & X86_FCW_PM))
4853 fFsw |= X86_FSW_ES | X86_FSW_B;
4854 }
4855 /*
4856 * Too large/small number outside the target integer range.
4857 */
4858 else
4859 {
4860 fFsw |= X86_FSW_IE;
4861 if (fFcw & X86_FCW_IM)
4862 *pd80Dst = s_d80Indefinite;
4863 else
4864 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT);
4865 }
4866 }
4867 /*
4868 * Map both +0 and -0 to integer zero (signless/+).
4869 */
4870 else if (RTFLOAT80U_IS_ZERO(pr80Src))
4871 *pd80Dst = s_ad80Zeros[fSignIn];
4872 /*
4873 * Denormals are just really tiny sub-zero numbers that are either rounded
4874 * to zero, 1 or -1 depending on sign and rounding control.
4875 */
4876 else if (RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Src) || RTFLOAT80U_IS_DENORMAL(pr80Src))
4877 {
4878 if ((fFcw & X86_FCW_RC_MASK) != (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP))
4879 *pd80Dst = s_ad80Zeros[fSignIn];
4880 else
4881 {
4882 *pd80Dst = s_ad80One[fSignIn];
4883 fFsw |= X86_FSW_C1;
4884 }
4885 fFsw |= X86_FSW_PE;
4886 if (!(fFcw & X86_FCW_PM))
4887 fFsw |= X86_FSW_ES | X86_FSW_B;
4888 }
4889 /*
4890 * All other special values are considered invalid arguments and result
4891 * in an IE exception and indefinite value if masked.
4892 */
4893 else
4894 {
4895 fFsw |= X86_FSW_IE;
4896 if (fFcw & X86_FCW_IM)
4897 *pd80Dst = s_d80Indefinite;
4898 else
4899 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT);
4900 }
4901 *pu16FSW = fFsw;
4902}
4903
4904
4905/*********************************************************************************************************************************
4906* FPU Helpers *
4907*********************************************************************************************************************************/
4908AssertCompileSize(RTFLOAT128U, 16);
4909AssertCompileSize(RTFLOAT80U, 10);
4910AssertCompileSize(RTFLOAT64U, 8);
4911AssertCompileSize(RTFLOAT32U, 4);
4912
4913/**
4914 * Normalizes a possible pseudo-denormal value.
4915 *
4916 * Pseudo-denormal values are some oddities from the 8087 & 287 days. They are
4917 * denormals with the J-bit set, so they can simply be rewritten as 2**-16382,
4918 * i.e. changing uExponent from 0 to 1.
4919 *
4920 * This macro will declare a RTFLOAT80U with the name given by
4921 * @a a_r80ValNormalized and update the @a a_pr80Val variable to point to it if
4922 * a normalization was performed.
4923 *
4924 * @note This must be applied before calling SoftFloat with a value that could be
4925 * a pseudo-denormal, as SoftFloat doesn't handle pseudo-denormals
4926 * correctly.
4927 */
4928#define IEM_NORMALIZE_PSEUDO_DENORMAL(a_pr80Val, a_r80ValNormalized) \
4929 RTFLOAT80U a_r80ValNormalized; \
4930 if (RTFLOAT80U_IS_PSEUDO_DENORMAL(a_pr80Val)) \
4931 { \
4932 a_r80ValNormalized = *a_pr80Val; \
4933 a_r80ValNormalized.s.uExponent = 1; \
4934 a_pr80Val = &a_r80ValNormalized; \
4935 } else do {} while (0)
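/*
 * Typical use, as seen in the arithmetic workers further down in this file:
 *      IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
 *      IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
 * after which the (possibly redirected) pointers are safe to hand to SoftFloat.
 */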
4936
4937#ifdef IEM_WITH_FLOAT128_FOR_FPU
4938
4939DECLINLINE(int) iemFpuF128SetRounding(uint16_t fFcw)
4940{
4941 int fNew;
4942 switch (fFcw & X86_FCW_RC_MASK)
4943 {
4944 default:
4945 case X86_FCW_RC_NEAREST: fNew = FE_TONEAREST; break;
4946 case X86_FCW_RC_ZERO: fNew = FE_TOWARDZERO; break;
4947 case X86_FCW_RC_UP: fNew = FE_UPWARD; break;
4948 case X86_FCW_RC_DOWN: fNew = FE_DOWNWARD; break;
4949 }
4950 int fOld = fegetround();
4951 fesetround(fNew);
4952 return fOld;
4953}
4954
4955
4956DECLINLINE(void) iemFpuF128RestoreRounding(int fOld)
4957{
4958 fesetround(fOld);
4959}
4960
4961DECLINLINE(_Float128) iemFpuF128FromFloat80(PCRTFLOAT80U pr80Val, uint16_t fFcw)
4962{
4963 RT_NOREF(fFcw);
4964 RTFLOAT128U Tmp;
4965 Tmp.s2.uSignAndExponent = pr80Val->s2.uSignAndExponent;
4966 Tmp.s2.uFractionHigh = (uint16_t)((pr80Val->s2.uMantissa & (RT_BIT_64(63) - 1)) >> 48);
4967 Tmp.s2.uFractionMid = (uint32_t)((pr80Val->s2.uMantissa & UINT32_MAX) >> 16);
4968 Tmp.s2.uFractionLow = pr80Val->s2.uMantissa << 48;
4969 if (RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val))
4970 {
4971 Assert(Tmp.s.uExponent == 0);
4972 Tmp.s2.uSignAndExponent++;
4973 }
4974 return *(_Float128 *)&Tmp;
4975}
4976
4977
4978DECLINLINE(uint16_t) iemFpuF128ToFloat80(PRTFLOAT80U pr80Dst, _Float128 rd128ValSrc, uint16_t fFcw, uint16_t fFsw)
4979{
4980 RT_NOREF(fFcw);
4981 RTFLOAT128U Tmp;
4982 *(_Float128 *)&Tmp = rd128ValSrc;
4983 ASMCompilerBarrier();
4984 if (RTFLOAT128U_IS_NORMAL(&Tmp))
4985 {
4986 pr80Dst->s.fSign = Tmp.s64.fSign;
4987 pr80Dst->s.uExponent = Tmp.s64.uExponent;
4988 uint64_t uFraction = Tmp.s64.uFractionHi << (63 - 48)
4989 | Tmp.s64.uFractionLo >> (64 - 15);
4990
4991 /* Do rounding - just truncate in near mode when midway on an even outcome. */
4992 unsigned const cShiftOff = 64 - 15;
4993 uint64_t const fRoundingOffMask = RT_BIT_64(cShiftOff) - 1;
4994 uint64_t const uRoundedOff = Tmp.s64.uFractionLo & fRoundingOffMask;
4995 if (uRoundedOff)
4996 {
4997 uint64_t const uRoundingAdd = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
4998 ? RT_BIT_64(cShiftOff - 1)
4999 : (fFcw & X86_FCW_RC_MASK) == (Tmp.s64.fSign ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)
5000 ? fRoundingOffMask
5001 : 0;
5002 if ( (fFcw & X86_FCW_RC_MASK) != X86_FCW_RC_NEAREST
5003 || (Tmp.s64.uFractionLo & RT_BIT_64(cShiftOff))
5004 || uRoundedOff != uRoundingAdd)
5005 {
5006 if ((uRoundedOff + uRoundingAdd) >> cShiftOff)
5007 {
5008 uFraction += 1;
5009 if (!(uFraction & RT_BIT_64(63)))
5010 { /* likely */ }
5011 else
5012 {
5013 uFraction >>= 1;
5014 pr80Dst->s.uExponent++;
5015 if (pr80Dst->s.uExponent == RTFLOAT64U_EXP_MAX)
5016 return fFsw;
5017 }
5018 fFsw |= X86_FSW_C1;
5019 }
5020 }
5021 fFsw |= X86_FSW_PE;
5022 if (!(fFcw & X86_FCW_PM))
5023 fFsw |= X86_FSW_ES | X86_FSW_B;
5024 }
5025 pr80Dst->s.uMantissa = RT_BIT_64(63) | uFraction;
5026 }
5027 else if (RTFLOAT128U_IS_ZERO(&Tmp))
5028 {
5029 pr80Dst->s.fSign = Tmp.s64.fSign;
5030 pr80Dst->s.uExponent = 0;
5031 pr80Dst->s.uMantissa = 0;
5032 }
5033 else if (RTFLOAT128U_IS_INF(&Tmp))
5034 {
5035 pr80Dst->s.fSign = Tmp.s64.fSign;
5036 pr80Dst->s.uExponent = 0;
5037 pr80Dst->s.uMantissa = 0;
5038 }
5039 return fFsw;
5040}
5041
5042
5043#else /* !IEM_WITH_FLOAT128_FOR_FPU - SoftFloat */
5044
5045/** Initializer for the SoftFloat state structure. */
5046# define IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(a_fFcw) \
5047 { \
5048 softfloat_tininess_afterRounding, \
5049 ((a_fFcw) & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST ? (uint8_t)softfloat_round_near_even \
5050 : ((a_fFcw) & X86_FCW_RC_MASK) == X86_FCW_RC_UP ? (uint8_t)softfloat_round_max \
5051 : ((a_fFcw) & X86_FCW_RC_MASK) == X86_FCW_RC_DOWN ? (uint8_t)softfloat_round_min \
5052 : (uint8_t)softfloat_round_minMag, \
5053 0, \
5054 (uint8_t)((a_fFcw) & X86_FCW_XCPT_MASK), \
5055 ((a_fFcw) & X86_FCW_PC_MASK) == X86_FCW_PC_53 ? (uint8_t)64 \
5056 : ((a_fFcw) & X86_FCW_PC_MASK) == X86_FCW_PC_24 ? (uint8_t)32 : (uint8_t)80 \
5057 }
5058
5059/** Returns updated FSW from a SoftFloat state and exception mask (FCW). */
5060# define IEM_SOFTFLOAT_STATE_TO_FSW(a_fFsw, a_pSoftState, a_fFcw) \
5061 ( (a_fFsw) \
5062 | (uint16_t)(((a_pSoftState)->exceptionFlags & softfloat_flag_c1) << 2) \
5063 | ((a_pSoftState)->exceptionFlags & X86_FSW_XCPT_MASK) \
5064 | ( ((a_pSoftState)->exceptionFlags & X86_FSW_XCPT_MASK) & (~(a_fFcw) & X86_FSW_XCPT_MASK) \
5065 ? X86_FSW_ES | X86_FSW_B : 0) )
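/*
 * Typical pattern (sketch): initialize a SoftFloat state from FCW, run the
 * operation, then fold the accumulated exception flags back into FSW:
 *      softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
 *      extFloat80_t r80XResult = extF80_mul(iemFpuSoftF80FromIprt(pr80Val1),
 *                                           iemFpuSoftF80FromIprt(pr80Val2), &SoftState);
 *      fFsw = IEM_SOFTFLOAT_STATE_TO_FSW(fFsw, &SoftState, fFcw);
 * The arithmetic workers below use iemFpuSoftStateAndF80ToFswAndIprtResult
 * instead, which additionally stores the result and handles unmasked #IE.
 */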
5066
5067
5068DECLINLINE(float128_t) iemFpuSoftF128Precision(float128_t r128, unsigned cBits, uint16_t fFcw = X86_FCW_RC_NEAREST)
5069{
5070 RT_NOREF(fFcw);
5071 Assert(cBits > 64);
5072# if 0 /* rounding does not seem to help */
5073 uint64_t off = r128.v[0] & (RT_BIT_64(1 + 112 - cBits) - 1);
5074 r128.v[0] &= ~(RT_BIT_64(1 + 112 - cBits) - 1);
5075 if (off >= RT_BIT_64(1 + 112 - cBits - 1)
5076 && (r128.v[0] & RT_BIT_64(1 + 112 - cBits)))
5077 {
5078 uint64_t uOld = r128.v[0];
5079 r128.v[0] += RT_BIT_64(1 + 112 - cBits);
5080 if (r128.v[0] < uOld)
5081 r128.v[1] += 1;
5082 }
5083# else
5084 r128.v[0] &= ~(RT_BIT_64(1 + 112 - cBits) - 1);
5085# endif
5086 return r128;
5087}
5088
5089
5090DECLINLINE(float128_t) iemFpuSoftF128PrecisionIprt(PCRTFLOAT128U pr128, unsigned cBits, uint16_t fFcw = X86_FCW_RC_NEAREST)
5091{
5092 RT_NOREF(fFcw);
5093 Assert(cBits > 64);
5094# if 0 /* rounding does not seem to help, not even on constants */
5095 float128_t r128 = { pr128->au64[0], pr128->au64[1] };
5096 uint64_t off = r128.v[0] & (RT_BIT_64(1 + 112 - cBits) - 1);
5097 r128.v[0] &= ~(RT_BIT_64(1 + 112 - cBits) - 1);
5098 if (off >= RT_BIT_64(1 + 112 - cBits - 1)
5099 && (r128.v[0] & RT_BIT_64(1 + 112 - cBits)))
5100 {
5101 uint64_t uOld = r128.v[0];
5102 r128.v[0] += RT_BIT_64(1 + 112 - cBits);
5103 if (r128.v[0] < uOld)
5104 r128.v[1] += 1;
5105 }
5106 return r128;
5107# else
5108 float128_t r128 = { { pr128->au64[0] & ~(RT_BIT_64(1 + 112 - cBits) - 1), pr128->au64[1] } };
5109 return r128;
5110# endif
5111}
5112
5113
5114# if 0 /* unused */
5115DECLINLINE(float128_t) iemFpuSoftF128FromIprt(PCRTFLOAT128U pr128)
5116{
5117 float128_t r128 = { { pr128->au64[0], pr128->au64[1] } };
5118 return r128;
5119}
5120# endif
5121
5122
5123 /** Converts an 80-bit floating point value to SoftFloat 128-bit floating point. */
5124DECLINLINE(float128_t) iemFpuSoftF128FromFloat80(PCRTFLOAT80U pr80Val)
5125{
5126 extFloat80_t Tmp;
5127 Tmp.signExp = pr80Val->s2.uSignAndExponent;
5128 Tmp.signif = pr80Val->s2.uMantissa;
5129 softfloat_state_t Ignored = SOFTFLOAT_STATE_INIT_DEFAULTS();
5130 return extF80_to_f128(Tmp, &Ignored);
5131}
5132
5133
5134/**
5135 * Converts from the packed IPRT 80-bit floating point (RTFLOAT80U) format to
5136 * the SoftFloat extended 80-bit floating point format (extFloat80_t).
5137 *
5138 * This is only a structure format conversion, nothing else.
5139 */
5140DECLINLINE(extFloat80_t) iemFpuSoftF80FromIprt(PCRTFLOAT80U pr80Val)
5141{
5142 extFloat80_t Tmp;
5143 Tmp.signExp = pr80Val->s2.uSignAndExponent;
5144 Tmp.signif = pr80Val->s2.uMantissa;
5145 return Tmp;
5146}
5147
5148
5149/**
5150 * Converts from SoftFloat extended 80-bit floating point format (extFloat80_t)
5151 * to the packed IPRT 80-bit floating point (RTFLOAT80U) format.
5152 *
5153 * This is only a structure format conversion, nothing else.
5154 */
5155DECLINLINE(PRTFLOAT80U) iemFpuSoftF80ToIprt(PRTFLOAT80U pr80Dst, extFloat80_t const r80XSrc)
5156{
5157 pr80Dst->s2.uSignAndExponent = r80XSrc.signExp;
5158 pr80Dst->s2.uMantissa = r80XSrc.signif;
5159 return pr80Dst;
5160}
5161
5162
5163DECLINLINE(uint16_t) iemFpuSoftF128ToFloat80(PRTFLOAT80U pr80Dst, float128_t r128Src, uint16_t fFcw, uint16_t fFsw)
5164{
5165 RT_NOREF(fFcw);
5166 RTFLOAT128U Tmp;
5167 *(float128_t *)&Tmp = r128Src;
5168 ASMCompilerBarrier();
5169
5170 if (RTFLOAT128U_IS_NORMAL(&Tmp))
5171 {
5172 pr80Dst->s.fSign = Tmp.s64.fSign;
5173 pr80Dst->s.uExponent = Tmp.s64.uExponent;
5174 uint64_t uFraction = Tmp.s64.uFractionHi << (63 - 48)
5175 | Tmp.s64.uFractionLo >> (64 - 15);
5176
5177 /* Do rounding - just truncate in near mode when midway on an even outcome. */
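    /* Example: cShiftOff is 49, so a discarded part equal to exactly RT_BIT_64(48)
       is the halfway case; in nearest mode it is only rounded up when the lowest
       kept fraction bit is 1 (odd), otherwise it is truncated so the result stays even. */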
5178 unsigned const cShiftOff = 64 - 15;
5179 uint64_t const fRoundingOffMask = RT_BIT_64(cShiftOff) - 1;
5180 uint64_t const uRoundedOff = Tmp.s64.uFractionLo & fRoundingOffMask;
5181 if (uRoundedOff)
5182 {
5183 uint64_t const uRoundingAdd = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
5184 ? RT_BIT_64(cShiftOff - 1)
5185 : (fFcw & X86_FCW_RC_MASK) == (Tmp.s64.fSign ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)
5186 ? fRoundingOffMask
5187 : 0;
5188 if ( (fFcw & X86_FCW_RC_MASK) != X86_FCW_RC_NEAREST
5189 || (Tmp.s64.uFractionLo & RT_BIT_64(cShiftOff))
5190 || uRoundedOff != uRoundingAdd)
5191 {
5192 if ((uRoundedOff + uRoundingAdd) >> cShiftOff)
5193 {
5194 uFraction += 1;
5195 if (!(uFraction & RT_BIT_64(63)))
5196 { /* likely */ }
5197 else
5198 {
5199 uFraction >>= 1;
5200 pr80Dst->s.uExponent++;
5201 if (pr80Dst->s.uExponent == RTFLOAT64U_EXP_MAX)
5202 return fFsw;
5203 }
5204 fFsw |= X86_FSW_C1;
5205 }
5206 }
5207 fFsw |= X86_FSW_PE;
5208 if (!(fFcw & X86_FCW_PM))
5209 fFsw |= X86_FSW_ES | X86_FSW_B;
5210 }
5211
5212 pr80Dst->s.uMantissa = RT_BIT_64(63) | uFraction;
5213 }
5214 else if (RTFLOAT128U_IS_ZERO(&Tmp))
5215 {
5216 pr80Dst->s.fSign = Tmp.s64.fSign;
5217 pr80Dst->s.uExponent = 0;
5218 pr80Dst->s.uMantissa = 0;
5219 }
5220 else if (RTFLOAT128U_IS_INF(&Tmp))
5221 {
5222 pr80Dst->s.fSign = Tmp.s64.fSign;
5223 pr80Dst->s.uExponent = 0;
5224 pr80Dst->s.uMantissa = 0;
5225 }
5226 return fFsw;
5227}
5228
5229
5230/**
5231 * Helper for transferring exception and C1 to FSW and setting the result value
5232 * accordingly.
5233 *
5234 * @returns Updated FSW.
5235 * @param pSoftState The SoftFloat state following the operation.
5236 * @param r80XResult The result of the SoftFloat operation.
5237 * @param pr80Result Where to store the result for IEM.
5238 * @param fFcw The FPU control word.
5239 * @param fFsw The FSW before the operation, with necessary bits
5240 * cleared and such.
5241 * @param pr80XcptResult Alternative return value for use when an unmasked \#IE is
5242 * raised.
5243 */
5244DECLINLINE(uint16_t) iemFpuSoftStateAndF80ToFswAndIprtResult(softfloat_state_t const *pSoftState, extFloat80_t r80XResult,
5245 PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw,
5246 PCRTFLOAT80U pr80XcptResult)
5247{
5248 fFsw |= (pSoftState->exceptionFlags & X86_FSW_XCPT_MASK)
5249 | (uint16_t)((pSoftState->exceptionFlags & softfloat_flag_c1) << 2);
5250 if (fFsw & ~fFcw & X86_FSW_XCPT_MASK)
5251 fFsw |= X86_FSW_ES | X86_FSW_B;
5252
5253 if (!(fFsw & ~fFcw & (X86_FSW_IE | X86_FSW_DE)))
5254 iemFpuSoftF80ToIprt(pr80Result, r80XResult);
5255 else
5256 {
5257 fFsw &= ~(X86_FSW_OE | X86_FSW_UE | X86_FSW_PE | X86_FSW_ZE | X86_FSW_C1);
5258 *pr80Result = *pr80XcptResult;
5259 }
5260 return fFsw;
5261}
5262
5263
5264/**
5265 * Helper doing polynomial evaluation using Horner's method.
5266 *
5267 * See https://en.wikipedia.org/wiki/Horner%27s_method for details.
5268 */
5269float128_t iemFpuSoftF128HornerPoly(float128_t z, PCRTFLOAT128U g_par128HornerConsts, size_t cHornerConsts,
5270 unsigned cPrecision, softfloat_state_t *pSoftState)
5271{
5272 Assert(cHornerConsts > 1);
5273 size_t i = cHornerConsts - 1;
5274 float128_t r128Result = iemFpuSoftF128PrecisionIprt(&g_par128HornerConsts[i], cPrecision);
5275 while (i-- > 0)
5276 {
5277 r128Result = iemFpuSoftF128Precision(f128_mul(r128Result, z, pSoftState), cPrecision);
5278 r128Result = f128_add(r128Result, iemFpuSoftF128PrecisionIprt(&g_par128HornerConsts[i], cPrecision), pSoftState);
5279 r128Result = iemFpuSoftF128Precision(r128Result, cPrecision);
5280 }
5281 return r128Result;
5282}
5283
5284#endif /* !IEM_WITH_FLOAT128_FOR_FPU - SoftFloat */
5285
5286
5287/**
5288 * Composes a normalized and rounded RTFLOAT80U result from a 192 bit wide
5289 * mantissa, exponent and sign.
5290 *
5291 * @returns Updated FSW.
5292 * @param pr80Dst Where to return the composed value.
5293 * @param fSign The sign.
5294 * @param puMantissa The mantissa, 256-bit type but the top 64 bits are
5295 * ignored and should be zero. This will probably be
5296 * modified during normalization and rounding.
5297 * @param iExponent Unbiased exponent.
5298 * @param fFcw The FPU control word.
5299 * @param fFsw The FPU status word.
5300 */
5301static uint16_t iemFpuFloat80RoundAndComposeFrom192(PRTFLOAT80U pr80Dst, bool fSign, PRTUINT256U puMantissa,
5302 int32_t iExponent, uint16_t fFcw, uint16_t fFsw)
5303{
5304 AssertStmt(puMantissa->QWords.qw3 == 0, puMantissa->QWords.qw3 = 0);
5305
5306 iExponent += RTFLOAT80U_EXP_BIAS;
5307
5308 /* Do normalization if necessary and possible. */
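    /* Example: if the highest set bit of the 192-bit input is bit 180, cShift
       becomes 11, so the mantissa is shifted left by 11 and the exponent reduced
       by 11; qw2 then holds the 64-bit mantissa and qw1/qw0 the bits rounded off below. */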
5309 if (!(puMantissa->QWords.qw2 & RT_BIT_64(63)))
5310 {
5311 int cShift = 192 - RTUInt256BitCount(puMantissa);
5312 if (iExponent > cShift)
5313 iExponent -= cShift;
5314 else
5315 {
5316 if (fFcw & X86_FCW_UM)
5317 {
5318 if (iExponent > 0)
5319 cShift = --iExponent;
5320 else
5321 cShift = 0;
5322 }
5323 iExponent -= cShift;
5324 }
5325 RTUInt256AssignShiftLeft(puMantissa, cShift);
5326 }
5327
5328 /* Do rounding. */
5329 uint64_t uMantissa = puMantissa->QWords.qw2;
5330 if (puMantissa->QWords.qw1 || puMantissa->QWords.qw0)
5331 {
5332 bool fAdd;
5333 switch (fFcw & X86_FCW_RC_MASK)
5334 {
5335 default: /* (for the simple-minded MSC which otherwise thinks fAdd would be used uninitialized) */
5336 case X86_FCW_RC_NEAREST:
5337 if (puMantissa->QWords.qw1 & RT_BIT_64(63))
5338 {
5339 if ( (uMantissa & 1)
5340 || puMantissa->QWords.qw0 != 0
5341 || puMantissa->QWords.qw1 != RT_BIT_64(63))
5342 {
5343 fAdd = true;
5344 break;
5345 }
5346 uMantissa &= ~(uint64_t)1;
5347 }
5348 fAdd = false;
5349 break;
5350 case X86_FCW_RC_ZERO:
5351 fAdd = false;
5352 break;
5353 case X86_FCW_RC_UP:
5354 fAdd = !fSign;
5355 break;
5356 case X86_FCW_RC_DOWN:
5357 fAdd = fSign;
5358 break;
5359 }
5360 if (fAdd)
5361 {
5362 uint64_t const uTmp = uMantissa;
5363 uMantissa = uTmp + 1;
5364 if (uMantissa < uTmp)
5365 {
5366 uMantissa >>= 1;
5367 uMantissa |= RT_BIT_64(63);
5368 iExponent++;
5369 }
5370 fFsw |= X86_FSW_C1;
5371 }
5372 fFsw |= X86_FSW_PE;
5373 if (!(fFcw & X86_FCW_PM))
5374 fFsw |= X86_FSW_ES | X86_FSW_B;
5375 }
5376
5377 /* Check for underflow (denormals). */
5378 if (iExponent <= 0)
5379 {
5380 if (fFcw & X86_FCW_UM)
5381 {
5382 if (uMantissa & RT_BIT_64(63))
5383 uMantissa >>= 1;
5384 iExponent = 0;
5385 }
5386 else
5387 {
5388 iExponent += RTFLOAT80U_EXP_BIAS_ADJUST;
5389 fFsw |= X86_FSW_ES | X86_FSW_B;
5390 }
5391 fFsw |= X86_FSW_UE;
5392 }
5393 /* Check for overflow */
5394 else if (iExponent >= RTFLOAT80U_EXP_MAX)
5395 {
5396 Assert(iExponent < RTFLOAT80U_EXP_MAX);
5397 }
5398
5399 /* Compose the result. */
5400 pr80Dst->s.uMantissa = uMantissa;
5401 pr80Dst->s.uExponent = iExponent;
5402 pr80Dst->s.fSign = fSign;
5403 return fFsw;
5404}
5405
5406
5407/**
5408 * See also iemAImpl_fld_r80_from_r32
5409 */
5410static uint16_t iemAImplConvertR32ToR80(PCRTFLOAT32U pr32Val, PRTFLOAT80U pr80Dst)
5411{
5412 uint16_t fFsw = 0;
5413 if (RTFLOAT32U_IS_NORMAL(pr32Val))
5414 {
5415 pr80Dst->sj64.fSign = pr32Val->s.fSign;
5416 pr80Dst->sj64.fInteger = 1;
5417 pr80Dst->sj64.uFraction = (uint64_t)pr32Val->s.uFraction
5418 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS);
5419 pr80Dst->sj64.uExponent = pr32Val->s.uExponent - RTFLOAT32U_EXP_BIAS + RTFLOAT80U_EXP_BIAS;
5420 Assert(RTFLOAT80U_IS_NORMAL(pr80Dst));
5421 }
5422 else if (RTFLOAT32U_IS_ZERO(pr32Val))
5423 {
5424 pr80Dst->s.fSign = pr32Val->s.fSign;
5425 pr80Dst->s.uExponent = 0;
5426 pr80Dst->s.uMantissa = 0;
5427 Assert(RTFLOAT80U_IS_ZERO(pr80Dst));
5428 }
5429 else if (RTFLOAT32U_IS_SUBNORMAL(pr32Val))
5430 {
5431 /* Subnormal -> normalized + X86_FSW_DE return. */
5432 pr80Dst->sj64.fSign = pr32Val->s.fSign;
5433 pr80Dst->sj64.fInteger = 1;
5434 unsigned const cExtraShift = RTFLOAT32U_FRACTION_BITS - ASMBitLastSetU32(pr32Val->s.uFraction);
5435 pr80Dst->sj64.uFraction = (uint64_t)pr32Val->s.uFraction
5436 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS + cExtraShift + 1);
5437 pr80Dst->sj64.uExponent = pr32Val->s.uExponent - RTFLOAT32U_EXP_BIAS + RTFLOAT80U_EXP_BIAS - cExtraShift;
5438 fFsw = X86_FSW_DE;
5439 }
5440 else if (RTFLOAT32U_IS_INF(pr32Val))
5441 {
5442 pr80Dst->s.fSign = pr32Val->s.fSign;
5443 pr80Dst->s.uExponent = RTFLOAT80U_EXP_MAX;
5444 pr80Dst->s.uMantissa = RT_BIT_64(63);
5445 Assert(RTFLOAT80U_IS_INF(pr80Dst));
5446 }
5447 else
5448 {
5449 Assert(RTFLOAT32U_IS_NAN(pr32Val));
5450 pr80Dst->sj64.fSign = pr32Val->s.fSign;
5451 pr80Dst->sj64.uExponent = RTFLOAT80U_EXP_MAX;
5452 pr80Dst->sj64.fInteger = 1;
5453 pr80Dst->sj64.uFraction = (uint64_t)pr32Val->s.uFraction
5454 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS);
5455 Assert(RTFLOAT80U_IS_NAN(pr80Dst));
5456 Assert(RTFLOAT80U_IS_SIGNALLING_NAN(pr80Dst) == RTFLOAT32U_IS_SIGNALLING_NAN(pr32Val));
5457 }
5458 return fFsw;
5459}
5460
5461
5462/**
5463 * See also iemAImpl_fld_r80_from_r64
5464 */
5465static uint16_t iemAImplConvertR64ToR80(PCRTFLOAT64U pr64Val, PRTFLOAT80U pr80Dst)
5466{
5467 uint16_t fFsw = 0;
5468 if (RTFLOAT64U_IS_NORMAL(pr64Val))
5469 {
5470 pr80Dst->sj64.fSign = pr64Val->s.fSign;
5471 pr80Dst->sj64.fInteger = 1;
5472 pr80Dst->sj64.uFraction = pr64Val->s64.uFraction << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS);
5473 pr80Dst->sj64.uExponent = pr64Val->s.uExponent - RTFLOAT64U_EXP_BIAS + RTFLOAT80U_EXP_BIAS;
5474 Assert(RTFLOAT80U_IS_NORMAL(pr80Dst));
5475 }
5476 else if (RTFLOAT64U_IS_ZERO(pr64Val))
5477 {
5478 pr80Dst->s.fSign = pr64Val->s.fSign;
5479 pr80Dst->s.uExponent = 0;
5480 pr80Dst->s.uMantissa = 0;
5481 Assert(RTFLOAT80U_IS_ZERO(pr80Dst));
5482 }
5483 else if (RTFLOAT64U_IS_SUBNORMAL(pr64Val))
5484 {
5485 /* Subnormal values get normalized. */
5486 pr80Dst->sj64.fSign = pr64Val->s.fSign;
5487 pr80Dst->sj64.fInteger = 1;
5488 unsigned const cExtraShift = RTFLOAT64U_FRACTION_BITS - ASMBitLastSetU64(pr64Val->s64.uFraction);
5489 pr80Dst->sj64.uFraction = pr64Val->s64.uFraction
5490 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS + cExtraShift + 1);
5491 pr80Dst->sj64.uExponent = pr64Val->s.uExponent - RTFLOAT64U_EXP_BIAS + RTFLOAT80U_EXP_BIAS - cExtraShift;
5492 fFsw = X86_FSW_DE;
5493 }
5494 else if (RTFLOAT64U_IS_INF(pr64Val))
5495 {
5496 pr80Dst->s.fSign = pr64Val->s.fSign;
5497 pr80Dst->s.uExponent = RTFLOAT80U_EXP_MAX;
5498 pr80Dst->s.uMantissa = RT_BIT_64(63);
5499 Assert(RTFLOAT80U_IS_INF(pr80Dst));
5500 }
5501 else
5502 {
5503 /* Signalling and quiet NaNs, both turn into quiet ones when loaded (weird). */
5504 Assert(RTFLOAT64U_IS_NAN(pr64Val));
5505 pr80Dst->sj64.fSign = pr64Val->s.fSign;
5506 pr80Dst->sj64.uExponent = RTFLOAT80U_EXP_MAX;
5507 pr80Dst->sj64.fInteger = 1;
5508 pr80Dst->sj64.uFraction = pr64Val->s64.uFraction << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS);
5509 Assert(RTFLOAT80U_IS_NAN(pr80Dst));
5510 Assert(RTFLOAT80U_IS_SIGNALLING_NAN(pr80Dst) == RTFLOAT64U_IS_SIGNALLING_NAN(pr64Val));
5511 }
5512 return fFsw;
5513}
5514
5515
5516/**
5517 * See also EMIT_FILD.
5518 */
5519#define EMIT_CONVERT_IXX_TO_R80(a_cBits) \
5520static PRTFLOAT80U iemAImplConvertI ## a_cBits ## ToR80(int ## a_cBits ## _t iVal, PRTFLOAT80U pr80Dst) \
5521{ \
5522 if (iVal == 0) \
5523 { \
5524 pr80Dst->s.fSign = 0; \
5525 pr80Dst->s.uExponent = 0; \
5526 pr80Dst->s.uMantissa = 0; \
5527 } \
5528 else \
5529 { \
5530 if (iVal > 0) \
5531 pr80Dst->s.fSign = 0; \
5532 else \
5533 { \
5534 pr80Dst->s.fSign = 1; \
5535 iVal = -iVal; \
5536 } \
5537 unsigned const cBits = ASMBitLastSetU ## a_cBits((uint ## a_cBits ## _t)iVal); \
5538 pr80Dst->s.uExponent = cBits - 1 + RTFLOAT80U_EXP_BIAS; \
5539 pr80Dst->s.uMantissa = (uint64_t)iVal << (RTFLOAT80U_FRACTION_BITS + 1 - cBits); \
5540 } \
5541 return pr80Dst; \
5542}
5543EMIT_CONVERT_IXX_TO_R80(16)
5544EMIT_CONVERT_IXX_TO_R80(32)
5545//EMIT_CONVERT_IXX_TO_R80(64)
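/*
 * Worked example: converting the 16-bit integer 6 gives cBits = 3, so the
 * exponent becomes 2 + RTFLOAT80U_EXP_BIAS and the mantissa 6 << 61 =
 * 0xc000000000000000, i.e. 1.5 * 2^2 = 6.0 as an 80-bit float.
 */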
5546
5547/** For implementing iemAImpl_fmul_r80_by_r64 and such. */
5548#define EMIT_R80_BY_R64(a_Name, a_fnR80ByR80, a_DenormalException) \
5549IEM_DECL_IMPL_DEF(void, a_Name,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val1, PCRTFLOAT64U pr64Val2)) \
5550{ \
5551 RTFLOAT80U r80Val2; \
5552 uint16_t fFsw = iemAImplConvertR64ToR80(pr64Val2, &r80Val2); \
5553 Assert(!fFsw || fFsw == X86_FSW_DE); \
5554 if (fFsw) \
5555 { \
5556 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_NAN(pr80Val1) || (a_DenormalException)) \
5557 fFsw = 0; \
5558 else if (!(pFpuState->FCW & X86_FCW_DM)) \
5559 { \
5560 pFpuRes->r80Result = *pr80Val1; \
5561 pFpuRes->FSW = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT) \
5562 | X86_FSW_DE | X86_FSW_ES | X86_FSW_B; \
5563 return; \
5564 } \
5565 } \
5566 a_fnR80ByR80(pFpuState, pFpuRes, pr80Val1, &r80Val2); \
5567 pFpuRes->FSW = (pFpuRes->FSW & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT) | fFsw; \
5568}
5569
5570/** For implementing iemAImpl_fmul_r80_by_r32 and such. */
5571#define EMIT_R80_BY_R32(a_Name, a_fnR80ByR80, a_DenormalException) \
5572IEM_DECL_IMPL_DEF(void, a_Name,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val1, PCRTFLOAT32U pr32Val2)) \
5573{ \
5574 RTFLOAT80U r80Val2; \
5575 uint16_t fFsw = iemAImplConvertR32ToR80(pr32Val2, &r80Val2); \
5576 Assert(!fFsw || fFsw == X86_FSW_DE); \
5577 if (fFsw) \
5578 { \
5579 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_NAN(pr80Val1) || (a_DenormalException)) \
5580 fFsw = 0; \
5581 else if (!(pFpuState->FCW & X86_FCW_DM)) \
5582 { \
5583 pFpuRes->r80Result = *pr80Val1; \
5584 pFpuRes->FSW = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT) \
5585 | X86_FSW_DE | X86_FSW_ES | X86_FSW_B; \
5586 return; \
5587 } \
5588 } \
5589 a_fnR80ByR80(pFpuState, pFpuRes, pr80Val1, &r80Val2); \
5590 pFpuRes->FSW = (pFpuRes->FSW & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT) | fFsw; \
5591}
5592
5593/** For implementing iemAImpl_fimul_r80_by_i32 and such. */
5594#define EMIT_R80_BY_I32(a_Name, a_fnR80ByR80) \
5595IEM_DECL_IMPL_DEF(void, a_Name,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val1, int32_t const *pi32Val2)) \
5596{ \
5597 RTFLOAT80U r80Val2; \
5598 a_fnR80ByR80(pFpuState, pFpuRes, pr80Val1, iemAImplConvertI32ToR80(*pi32Val2, &r80Val2)); \
5599 pFpuRes->FSW = (pFpuRes->FSW & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT); \
5600}
5601
5602/** For implementing iemAImpl_fimul_r80_by_i16 and such. */
5603#define EMIT_R80_BY_I16(a_Name, a_fnR80ByR80) \
5604IEM_DECL_IMPL_DEF(void, a_Name,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val1, int16_t const *pi16Val2)) \
5605{ \
5606 RTFLOAT80U r80Val2; \
5607 a_fnR80ByR80(pFpuState, pFpuRes, pr80Val1, iemAImplConvertI16ToR80(*pi16Val2, &r80Val2)); \
5608 pFpuRes->FSW = (pFpuRes->FSW & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT); \
5609}
5610
5611
5612
5613/*********************************************************************************************************************************
5614* x87 FPU Division Operations *
5615*********************************************************************************************************************************/
5616
5617/** Worker for iemAImpl_fdiv_r80_by_r80 & iemAImpl_fdivr_r80_by_r80. */
5618static uint16_t iemAImpl_fdiv_f80_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result,
5619 uint16_t fFcw, uint16_t fFsw, PCRTFLOAT80U pr80Val1Org)
5620{
5621 if (!RTFLOAT80U_IS_ZERO(pr80Val2) || RTFLOAT80U_IS_NAN(pr80Val1) || RTFLOAT80U_IS_INF(pr80Val1))
5622 {
5623 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
5624 extFloat80_t r80XResult = extF80_div(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2), &SoftState);
5625 return iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, pr80Result, fFcw, fFsw, pr80Val1Org);
5626 }
5627 if (!RTFLOAT80U_IS_ZERO(pr80Val1))
5628 { /* Div by zero. */
5629 if (fFcw & X86_FCW_ZM)
5630 *pr80Result = g_ar80Infinity[pr80Val1->s.fSign != pr80Val2->s.fSign];
5631 else
5632 {
5633 *pr80Result = *pr80Val1Org;
5634 fFsw |= X86_FSW_ES | X86_FSW_B;
5635 }
5636 fFsw |= X86_FSW_ZE;
5637 }
5638 else
5639 { /* Invalid operand */
5640 if (fFcw & X86_FCW_IM)
5641 *pr80Result = g_r80Indefinite;
5642 else
5643 {
5644 *pr80Result = *pr80Val1Org;
5645 fFsw |= X86_FSW_ES | X86_FSW_B;
5646 }
5647 fFsw |= X86_FSW_IE;
5648 }
5649 return fFsw;
5650}
5651
5652
5653IEM_DECL_IMPL_DEF(void, iemAImpl_fdiv_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
5654 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
5655{
5656 uint16_t const fFcw = pFpuState->FCW;
5657 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
5658
5659 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals. */
5660 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
5661 {
5662 if (fFcw & X86_FCW_IM)
5663 pFpuRes->r80Result = g_r80Indefinite;
5664 else
5665 {
5666 pFpuRes->r80Result = *pr80Val1;
5667 fFsw |= X86_FSW_ES | X86_FSW_B;
5668 }
5669 fFsw |= X86_FSW_IE;
5670 }
5671 /* SoftFloat does not check for denormals and certainly does not report them to us. NaNs & /0 trump denormals. */
5672 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2) && !RTFLOAT80U_IS_ZERO(pr80Val2))
5673 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1)) )
5674 {
5675 if (fFcw & X86_FCW_DM)
5676 {
5677 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
5678 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
5679 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
5680 fFsw = iemAImpl_fdiv_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
5681 }
5682 else
5683 {
5684 pFpuRes->r80Result = *pr80Val1;
5685 fFsw |= X86_FSW_ES | X86_FSW_B;
5686 }
5687 fFsw |= X86_FSW_DE;
5688 }
5689 /* SoftFloat can handle the rest: */
5690 else
5691 fFsw = iemAImpl_fdiv_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
5692
5693 pFpuRes->FSW = fFsw;
5694}
5695
5696
5697EMIT_R80_BY_R64(iemAImpl_fdiv_r80_by_r64, iemAImpl_fdiv_r80_by_r80, 0)
5698EMIT_R80_BY_R32(iemAImpl_fdiv_r80_by_r32, iemAImpl_fdiv_r80_by_r80, 0)
5699EMIT_R80_BY_I32(iemAImpl_fidiv_r80_by_i32, iemAImpl_fdiv_r80_by_r80)
5700EMIT_R80_BY_I16(iemAImpl_fidiv_r80_by_i16, iemAImpl_fdiv_r80_by_r80)
5701
5702
5703IEM_DECL_IMPL_DEF(void, iemAImpl_fdivr_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
5704 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
5705{
5706 uint16_t const fFcw = pFpuState->FCW;
5707 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
5708
5709 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals. */
5710 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
5711 {
5712 if (fFcw & X86_FCW_IM)
5713 pFpuRes->r80Result = g_r80Indefinite;
5714 else
5715 {
5716 pFpuRes->r80Result = *pr80Val1;
5717 fFsw |= X86_FSW_ES | X86_FSW_B;
5718 }
5719 fFsw |= X86_FSW_IE;
5720 }
5721 /* SoftFloat does not check for denormals and certainly does not report them to us. NaNs & /0 trump denormals. */
5722 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2))
5723 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1) && !RTFLOAT80U_IS_ZERO(pr80Val1)) )
5724 {
5725 if (fFcw & X86_FCW_DM)
5726 {
5727 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
5728 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
5729 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
5730 fFsw = iemAImpl_fdiv_f80_r80_worker(pr80Val2, pr80Val1, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
5731 }
5732 else
5733 {
5734 pFpuRes->r80Result = *pr80Val1;
5735 fFsw |= X86_FSW_ES | X86_FSW_B;
5736 }
5737 fFsw |= X86_FSW_DE;
5738 }
5739 /* SoftFloat can handle the rest: */
5740 else
5741 fFsw = iemAImpl_fdiv_f80_r80_worker(pr80Val2, pr80Val1, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
5742
5743 pFpuRes->FSW = fFsw;
5744}
5745
5746
5747EMIT_R80_BY_R64(iemAImpl_fdivr_r80_by_r64, iemAImpl_fdivr_r80_by_r80, RTFLOAT80U_IS_ZERO(pr80Val1))
5748EMIT_R80_BY_R32(iemAImpl_fdivr_r80_by_r32, iemAImpl_fdivr_r80_by_r80, RTFLOAT80U_IS_ZERO(pr80Val1))
5749EMIT_R80_BY_I32(iemAImpl_fidivr_r80_by_i32, iemAImpl_fdivr_r80_by_r80)
5750EMIT_R80_BY_I16(iemAImpl_fidivr_r80_by_i16, iemAImpl_fdivr_r80_by_r80)
5751
5752
5753/** Worker for iemAImpl_fprem_r80_by_r80 & iemAImpl_fprem1_r80_by_r80. */
5754static uint16_t iemAImpl_fprem_fprem1_r80_by_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result,
5755 uint16_t fFcw, uint16_t fFsw, PCRTFLOAT80U pr80Val1Org, bool fLegacyInstr)
5756{
5757 if (!RTFLOAT80U_IS_ZERO(pr80Val2) || RTFLOAT80U_IS_NAN(pr80Val1) || RTFLOAT80U_IS_INF(pr80Val1))
5758 {
5759 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
5760 uint16_t fCxFlags = 0;
5761 extFloat80_t r80XResult = extF80_partialRem(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2),
5762 fLegacyInstr ? softfloat_round_minMag : softfloat_round_near_even,
5763 &fCxFlags, &SoftState);
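        /* FPREM (fLegacyInstr) truncates the quotient while FPREM1 rounds it to
           nearest/even; e.g. for ST0=5.0, ST1=3.0: FPREM yields 5 - 1*3 = 2.0,
           FPREM1 yields 5 - 2*3 = -1.0. */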
5764 Assert(!(fCxFlags & ~X86_FSW_C_MASK));
5765 fFsw = iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, pr80Result, fFcw, fFsw, pr80Val1Org);
5766 if ( !(fFsw & X86_FSW_IE)
5767 && !RTFLOAT80U_IS_NAN(pr80Result)
5768 && !RTFLOAT80U_IS_INDEFINITE(pr80Result))
5769 {
5770 fFsw &= ~(uint16_t)X86_FSW_C_MASK;
5771 fFsw |= fCxFlags & X86_FSW_C_MASK;
5772 }
5773 return fFsw;
5774 }
5775
5776 /* Invalid operand */
5777 if (fFcw & X86_FCW_IM)
5778 *pr80Result = g_r80Indefinite;
5779 else
5780 {
5781 *pr80Result = *pr80Val1Org;
5782 fFsw |= X86_FSW_ES | X86_FSW_B;
5783 }
5784 return fFsw | X86_FSW_IE;
5785}
5786
5787
5788static void iemAImpl_fprem_fprem1_r80_by_r80(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
5789 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, bool fLegacyInstr)
5790{
5791 uint16_t const fFcw = pFpuState->FCW;
5792 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 /*| X86_FSW_C2*/ | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
5793
5794 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals.
5795 In addition, we'd like to handle zero ST(1) here ourselves as SoftFloat returns Inf instead
5796 of Indefinite. (Note! There is no #Z exception, contrary to what the footnotes to tables
5797 3-31 and 3-32 for the FPREM & FPREM1 instructions in the Intel reference manual claim!) */
5798 if ( RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2)
5799 || (RTFLOAT80U_IS_ZERO(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1) && !RTFLOAT80U_IS_INDEFINITE(pr80Val1)))
5800 {
5801 if (fFcw & X86_FCW_IM)
5802 pFpuRes->r80Result = g_r80Indefinite;
5803 else
5804 {
5805 pFpuRes->r80Result = *pr80Val1;
5806 fFsw |= X86_FSW_ES | X86_FSW_B;
5807 }
5808 fFsw |= X86_FSW_IE;
5809 }
5810 /* SoftFloat does not check for denormals and certainly does not report them to us. NaNs & /0 trump denormals. */
5811 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2) && !RTFLOAT80U_IS_ZERO(pr80Val2))
5812 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1) && !RTFLOAT80U_IS_INF(pr80Val1)) )
5813 {
5814 if (fFcw & X86_FCW_DM)
5815 {
5816 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
5817 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
5818 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
5819 fFsw = iemAImpl_fprem_fprem1_r80_by_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw,
5820 pr80Val1Org, fLegacyInstr);
5821 }
5822 else
5823 {
5824 pFpuRes->r80Result = *pr80Val1;
5825 fFsw |= X86_FSW_ES | X86_FSW_B;
5826 }
5827 fFsw |= X86_FSW_DE;
5828 }
5829 /* SoftFloat can handle the rest: */
5830 else
5831 fFsw = iemAImpl_fprem_fprem1_r80_by_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw,
5832 pr80Val1, fLegacyInstr);
5833
5834 pFpuRes->FSW = fFsw;
5835}
5836
5837
5838IEM_DECL_IMPL_DEF(void, iemAImpl_fprem_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
5839 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
5840{
5841 iemAImpl_fprem_fprem1_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2, true /*fLegacyInstr*/);
5842}
5843
5844
5845IEM_DECL_IMPL_DEF(void, iemAImpl_fprem1_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
5846 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
5847{
5848 iemAImpl_fprem_fprem1_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2, false /*fLegacyInstr*/);
5849}
5850
5851
5852/*********************************************************************************************************************************
5853* x87 FPU Multiplication Operations *
5854*********************************************************************************************************************************/
5855
5856/** Worker for iemAImpl_fmul_r80_by_r80. */
5857static uint16_t iemAImpl_fmul_f80_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result,
5858 uint16_t fFcw, uint16_t fFsw, PCRTFLOAT80U pr80Val1Org)
5859{
5860 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
5861 extFloat80_t r80XResult = extF80_mul(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2), &SoftState);
5862 return iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, pr80Result, fFcw, fFsw, pr80Val1Org);
5863}
5864
5865
5866IEM_DECL_IMPL_DEF(void, iemAImpl_fmul_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
5867 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
5868{
5869 uint16_t const fFcw = pFpuState->FCW;
5870 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
5871
5872 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals. */
5873 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
5874 {
5875 if (fFcw & X86_FCW_IM)
5876 pFpuRes->r80Result = g_r80Indefinite;
5877 else
5878 {
5879 pFpuRes->r80Result = *pr80Val1;
5880 fFsw |= X86_FSW_ES | X86_FSW_B;
5881 }
5882 fFsw |= X86_FSW_IE;
5883 }
5884 /* SoftFloat does not check for denormals and certainly does not report them to us. NaNs trump denormals. */
5885 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2))
5886 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1)) )
5887 {
5888 if (fFcw & X86_FCW_DM)
5889 {
5890 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
5891 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
5892 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
5893 fFsw = iemAImpl_fmul_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
5894 }
5895 else
5896 {
5897 pFpuRes->r80Result = *pr80Val1;
5898 fFsw |= X86_FSW_ES | X86_FSW_B;
5899 }
5900 fFsw |= X86_FSW_DE;
5901 }
5902 /* SoftFloat can handle the rest: */
5903 else
5904 fFsw = iemAImpl_fmul_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
5905
5906 pFpuRes->FSW = fFsw;
5907}
5908
5909
5910EMIT_R80_BY_R64(iemAImpl_fmul_r80_by_r64, iemAImpl_fmul_r80_by_r80, 0)
5911EMIT_R80_BY_R32(iemAImpl_fmul_r80_by_r32, iemAImpl_fmul_r80_by_r80, 0)
5912EMIT_R80_BY_I32(iemAImpl_fimul_r80_by_i32, iemAImpl_fmul_r80_by_r80)
5913EMIT_R80_BY_I16(iemAImpl_fimul_r80_by_i16, iemAImpl_fmul_r80_by_r80)
5914
5915
5916/*********************************************************************************************************************************
5917* x87 FPU Addition *
5918*********************************************************************************************************************************/
5919
5920/** Worker for iemAImpl_fadd_r80_by_r80. */
5921static uint16_t iemAImpl_fadd_f80_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result,
5922 uint16_t fFcw, uint16_t fFsw, PCRTFLOAT80U pr80Val1Org)
5923{
5924 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
5925 extFloat80_t r80XResult = extF80_add(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2), &SoftState);
5926 return iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, pr80Result, fFcw, fFsw, pr80Val1Org);
5927}
5928
5929
5930IEM_DECL_IMPL_DEF(void, iemAImpl_fadd_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
5931 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
5932{
5933 uint16_t const fFcw = pFpuState->FCW;
5934 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
5935
5936 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals. */
5937 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
5938 {
5939 if (fFcw & X86_FCW_IM)
5940 pFpuRes->r80Result = g_r80Indefinite;
5941 else
5942 {
5943 pFpuRes->r80Result = *pr80Val1;
5944 fFsw |= X86_FSW_ES | X86_FSW_B;
5945 }
5946 fFsw |= X86_FSW_IE;
5947 }
5948 /* SoftFloat does not check for denormals and certainly does not report them to us. NaNs trump denormals. */
5949 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2))
5950 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1)) )
5951 {
5952 if (fFcw & X86_FCW_DM)
5953 {
5954 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
5955 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
5956 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
5957 fFsw = iemAImpl_fadd_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
5958 }
5959 else
5960 {
5961 pFpuRes->r80Result = *pr80Val1;
5962 fFsw |= X86_FSW_ES | X86_FSW_B;
5963 }
5964 fFsw |= X86_FSW_DE;
5965 }
5966 /* SoftFloat can handle the rest: */
5967 else
5968 fFsw = iemAImpl_fadd_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
5969
5970 pFpuRes->FSW = fFsw;
5971}
5972
5973
5974EMIT_R80_BY_R64(iemAImpl_fadd_r80_by_r64, iemAImpl_fadd_r80_by_r80, 0)
5975EMIT_R80_BY_R32(iemAImpl_fadd_r80_by_r32, iemAImpl_fadd_r80_by_r80, 0)
5976EMIT_R80_BY_I32(iemAImpl_fiadd_r80_by_i32, iemAImpl_fadd_r80_by_r80)
5977EMIT_R80_BY_I16(iemAImpl_fiadd_r80_by_i16, iemAImpl_fadd_r80_by_r80)
5978
5979
5980/*********************************************************************************************************************************
5981* x87 FPU Subtraction *
5982*********************************************************************************************************************************/
5983
5984/** Worker for iemAImpl_fsub_r80_by_r80 and iemAImpl_fsubr_r80_by_r80. */
5985static uint16_t iemAImpl_fsub_f80_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result,
5986 uint16_t fFcw, uint16_t fFsw, PCRTFLOAT80U pr80Val1Org)
5987{
5988 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
5989 extFloat80_t r80XResult = extF80_sub(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2), &SoftState);
5990 return iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, pr80Result, fFcw, fFsw, pr80Val1Org);
5991}
5992
5993
5994IEM_DECL_IMPL_DEF(void, iemAImpl_fsub_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
5995 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
5996{
5997 uint16_t const fFcw = pFpuState->FCW;
5998 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
5999
6000 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals. */
6001 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
6002 {
6003 if (fFcw & X86_FCW_IM)
6004 pFpuRes->r80Result = g_r80Indefinite;
6005 else
6006 {
6007 pFpuRes->r80Result = *pr80Val1;
6008 fFsw |= X86_FSW_ES | X86_FSW_B;
6009 }
6010 fFsw |= X86_FSW_IE;
6011 }
6012 /* SoftFloat does not check for denormals and certainly does not report them to us. NaNs trump denormals. */
6013 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2))
6014 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1)) )
6015 {
6016 if (fFcw & X86_FCW_DM)
6017 {
6018 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
6019 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
6020 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
6021 fFsw = iemAImpl_fsub_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
6022 }
6023 else
6024 {
6025 pFpuRes->r80Result = *pr80Val1;
6026 fFsw |= X86_FSW_ES | X86_FSW_B;
6027 }
6028 fFsw |= X86_FSW_DE;
6029 }
6030 /* SoftFloat can handle the rest: */
6031 else
6032 fFsw = iemAImpl_fsub_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
6033
6034 pFpuRes->FSW = fFsw;
6035}
6036
6037
6038EMIT_R80_BY_R64(iemAImpl_fsub_r80_by_r64, iemAImpl_fsub_r80_by_r80, 0)
6039EMIT_R80_BY_R32(iemAImpl_fsub_r80_by_r32, iemAImpl_fsub_r80_by_r80, 0)
6040EMIT_R80_BY_I32(iemAImpl_fisub_r80_by_i32, iemAImpl_fsub_r80_by_r80)
6041EMIT_R80_BY_I16(iemAImpl_fisub_r80_by_i16, iemAImpl_fsub_r80_by_r80)
6042
6043
6044/* Same as iemAImpl_fsub_r80_by_r80, but with input operands switched. */
6045IEM_DECL_IMPL_DEF(void, iemAImpl_fsubr_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6046 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6047{
6048 uint16_t const fFcw = pFpuState->FCW;
6049 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
6050
6051 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals. */
6052 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
6053 {
6054 if (fFcw & X86_FCW_IM)
6055 pFpuRes->r80Result = g_r80Indefinite;
6056 else
6057 {
6058 pFpuRes->r80Result = *pr80Val1;
6059 fFsw |= X86_FSW_ES | X86_FSW_B;
6060 }
6061 fFsw |= X86_FSW_IE;
6062 }
6063 /* SoftFloat does not check for denormals and certainly does not report them to us. NaNs trump denormals. */
6064 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2))
6065 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1)) )
6066 {
6067 if (fFcw & X86_FCW_DM)
6068 {
6069 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
6070 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
6071 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
6072 fFsw = iemAImpl_fsub_f80_r80_worker(pr80Val2, pr80Val1, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
6073 }
6074 else
6075 {
6076 pFpuRes->r80Result = *pr80Val1;
6077 fFsw |= X86_FSW_ES | X86_FSW_B;
6078 }
6079 fFsw |= X86_FSW_DE;
6080 }
6081 /* SoftFloat can handle the rest: */
6082 else
6083 fFsw = iemAImpl_fsub_f80_r80_worker(pr80Val2, pr80Val1, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
6084
6085 pFpuRes->FSW = fFsw;
6086}
6087
6088
6089EMIT_R80_BY_R64(iemAImpl_fsubr_r80_by_r64, iemAImpl_fsubr_r80_by_r80, 0)
6090EMIT_R80_BY_R32(iemAImpl_fsubr_r80_by_r32, iemAImpl_fsubr_r80_by_r80, 0)
6091EMIT_R80_BY_I32(iemAImpl_fisubr_r80_by_i32, iemAImpl_fsubr_r80_by_r80)
6092EMIT_R80_BY_I16(iemAImpl_fisubr_r80_by_i16, iemAImpl_fsubr_r80_by_r80)
6093
6094
6095/*********************************************************************************************************************************
6096* x87 FPU Trigonometric Operations *
6097*********************************************************************************************************************************/
6098
6099
6100IEM_DECL_IMPL_DEF(void, iemAImpl_fpatan_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6101 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6102{
6103 RT_NOREF(pFpuState, pFpuRes, pr80Val1, pr80Val2);
6104 AssertReleaseFailed();
6105}
6106
6107#endif /* IEM_WITHOUT_ASSEMBLY */
6108
6109IEM_DECL_IMPL_DEF(void, iemAImpl_fpatan_r80_by_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6110 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6111{
6112 iemAImpl_fpatan_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
6113}
6114
6115IEM_DECL_IMPL_DEF(void, iemAImpl_fpatan_r80_by_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6116 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6117{
6118 iemAImpl_fpatan_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
6119}
6120
6121
6122#if defined(IEM_WITHOUT_ASSEMBLY)
6123IEM_DECL_IMPL_DEF(void, iemAImpl_fptan_r80_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
6124{
6125 RT_NOREF(pFpuState, pFpuResTwo, pr80Val);
6126 AssertReleaseFailed();
6127}
6128#endif /* IEM_WITHOUT_ASSEMBLY */
6129
6130IEM_DECL_IMPL_DEF(void, iemAImpl_fptan_r80_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
6131{
6132 iemAImpl_fptan_r80_r80(pFpuState, pFpuResTwo, pr80Val);
6133}
6134
6135IEM_DECL_IMPL_DEF(void, iemAImpl_fptan_r80_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
6136{
6137 iemAImpl_fptan_r80_r80(pFpuState, pFpuResTwo, pr80Val);
6138}
6139
6140
6141#ifdef IEM_WITHOUT_ASSEMBLY
6142IEM_DECL_IMPL_DEF(void, iemAImpl_fsin_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6143{
6144 RT_NOREF(pFpuState, pFpuRes, pr80Val);
6145 AssertReleaseFailed();
6146}
6147#endif /* IEM_WITHOUT_ASSEMBLY */
6148
6149IEM_DECL_IMPL_DEF(void, iemAImpl_fsin_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6150{
6151 iemAImpl_fsin_r80(pFpuState, pFpuRes, pr80Val);
6152}
6153
6154IEM_DECL_IMPL_DEF(void, iemAImpl_fsin_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6155{
6156 iemAImpl_fsin_r80(pFpuState, pFpuRes, pr80Val);
6157}
6158
6159#ifdef IEM_WITHOUT_ASSEMBLY
6160IEM_DECL_IMPL_DEF(void, iemAImpl_fsincos_r80_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
6161{
6162 RT_NOREF(pFpuState, pFpuResTwo, pr80Val);
6163 AssertReleaseFailed();
6164}
6165#endif /* IEM_WITHOUT_ASSEMBLY */
6166
6167IEM_DECL_IMPL_DEF(void, iemAImpl_fsincos_r80_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
6168{
6169 iemAImpl_fsincos_r80_r80(pFpuState, pFpuResTwo, pr80Val);
6170}
6171
6172IEM_DECL_IMPL_DEF(void, iemAImpl_fsincos_r80_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
6173{
6174 iemAImpl_fsincos_r80_r80(pFpuState, pFpuResTwo, pr80Val);
6175}
6176
6177
6178#ifdef IEM_WITHOUT_ASSEMBLY
6179IEM_DECL_IMPL_DEF(void, iemAImpl_fcos_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6180{
6181 RT_NOREF(pFpuState, pFpuRes, pr80Val);
6182 AssertReleaseFailed();
6183}
6184#endif /* IEM_WITHOUT_ASSEMBLY */
6185
6186IEM_DECL_IMPL_DEF(void, iemAImpl_fcos_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6187{
6188 iemAImpl_fcos_r80(pFpuState, pFpuRes, pr80Val);
6189}
6190
6191IEM_DECL_IMPL_DEF(void, iemAImpl_fcos_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6192{
6193 iemAImpl_fcos_r80(pFpuState, pFpuRes, pr80Val);
6194}
6195
6196#ifdef IEM_WITHOUT_ASSEMBLY
6197
6198
6199/*********************************************************************************************************************************
6200* x87 FPU Compare and Testing Operations *
6201*********************************************************************************************************************************/
6202
6203IEM_DECL_IMPL_DEF(void, iemAImpl_ftst_r80,(PCX86FXSTATE pFpuState, uint16_t *pu16Fsw, PCRTFLOAT80U pr80Val))
6204{
6205 uint16_t fFsw = (7 << X86_FSW_TOP_SHIFT);
6206
6207 if (RTFLOAT80U_IS_ZERO(pr80Val))
6208 fFsw |= X86_FSW_C3;
6209 else if (RTFLOAT80U_IS_NORMAL(pr80Val) || RTFLOAT80U_IS_INF(pr80Val))
6210 fFsw |= pr80Val->s.fSign ? X86_FSW_C0 : 0;
6211 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val))
6212 {
6213 fFsw |= pr80Val->s.fSign ? X86_FSW_C0 | X86_FSW_DE : X86_FSW_DE;
6214 if (!(pFpuState->FCW & X86_FCW_DM))
6215 fFsw |= X86_FSW_ES | X86_FSW_B;
6216 }
6217 else
6218 {
6219 fFsw |= X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3 | X86_FSW_IE;
6220 if (!(pFpuState->FCW & X86_FCW_IM))
6221 fFsw |= X86_FSW_ES | X86_FSW_B;
6222 }
6223
6224 *pu16Fsw = fFsw;
6225}
6226
6227
6228IEM_DECL_IMPL_DEF(void, iemAImpl_fxam_r80,(PCX86FXSTATE pFpuState, uint16_t *pu16Fsw, PCRTFLOAT80U pr80Val))
6229{
6230 RT_NOREF(pFpuState);
6231 uint16_t fFsw = (7 << X86_FSW_TOP_SHIFT);
6232
6233    /* C1 = sign bit (always set from the sign, even for empty registers according to Intel). */
6234 if (pr80Val->s.fSign)
6235 fFsw |= X86_FSW_C1;
6236
6237 /* Classify the value in C0, C2, C3. */
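6237    /* (The resulting encoding, read as C3:C2:C0, is: 000 unsupported, 001 NaN,
            010 normal finite, 011 infinity, 100 zero, 101 empty, 110 denormal.) */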
6238 if (!(pFpuState->FTW & RT_BIT_32(X86_FSW_TOP_GET(pFpuState->FSW))))
6239 fFsw |= X86_FSW_C0 | X86_FSW_C3; /* empty */
6240 else if (RTFLOAT80U_IS_NORMAL(pr80Val))
6241 fFsw |= X86_FSW_C2;
6242 else if (RTFLOAT80U_IS_ZERO(pr80Val))
6243 fFsw |= X86_FSW_C3;
6244 else if (RTFLOAT80U_IS_QUIET_OR_SIGNALLING_NAN(pr80Val))
6245 fFsw |= X86_FSW_C0;
6246 else if (RTFLOAT80U_IS_INF(pr80Val))
6247 fFsw |= X86_FSW_C0 | X86_FSW_C2;
6248 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val))
6249 fFsw |= X86_FSW_C2 | X86_FSW_C3;
6250 /* whatever else: 0 */
6251
6252 *pu16Fsw = fFsw;
6253}
6254
6255
6256/**
6257 * Worker for fcom, fucom, and friends.
6258 */
6259static uint16_t iemAImpl_fcom_r80_by_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2,
6260 uint16_t fFcw, uint16_t fFsw, bool fIeOnAllNaNs)
6261{
6262 /*
6263 * Unpack the values.
6264 */
6265 bool const fSign1 = pr80Val1->s.fSign;
6266 int32_t iExponent1 = pr80Val1->s.uExponent;
6267 uint64_t uMantissa1 = pr80Val1->s.uMantissa;
6268
6269 bool const fSign2 = pr80Val2->s.fSign;
6270 int32_t iExponent2 = pr80Val2->s.uExponent;
6271 uint64_t uMantissa2 = pr80Val2->s.uMantissa;
6272
6273 /*
6274 * Check for invalid inputs.
6275 */
6276 if ( RTFLOAT80U_IS_387_INVALID_EX(uMantissa1, iExponent1)
6277 || RTFLOAT80U_IS_387_INVALID_EX(uMantissa2, iExponent2))
6278 {
6279 if (!(fFcw & X86_FCW_IM))
6280 fFsw |= X86_FSW_ES | X86_FSW_B;
6281 return fFsw | X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3 | X86_FSW_IE;
6282 }
6283
6284 /*
6285     * Check for NaNs and indefinites; they are all unordered and trump #DE.
6286 */
6287 if ( RTFLOAT80U_IS_INDEFINITE_OR_QUIET_OR_SIGNALLING_NAN_EX(uMantissa1, iExponent1)
6288 || RTFLOAT80U_IS_INDEFINITE_OR_QUIET_OR_SIGNALLING_NAN_EX(uMantissa2, iExponent2))
6289 {
6290 if ( fIeOnAllNaNs
6291 || RTFLOAT80U_IS_SIGNALLING_NAN_EX(uMantissa1, iExponent1)
6292 || RTFLOAT80U_IS_SIGNALLING_NAN_EX(uMantissa2, iExponent2))
6293 {
6294 fFsw |= X86_FSW_IE;
6295 if (!(fFcw & X86_FCW_IM))
6296 fFsw |= X86_FSW_ES | X86_FSW_B;
6297 }
6298 return fFsw | X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3;
6299 }
6300
6301 /*
6302 * Normalize the values.
6303 */
6304 if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL_EX(uMantissa1, iExponent1))
6305 {
6306 if (RTFLOAT80U_IS_PSEUDO_DENORMAL_EX(uMantissa1, iExponent1))
6307 iExponent1 = 1;
6308 else
6309 {
6310 iExponent1 = 64 - ASMBitLastSetU64(uMantissa1);
6311 uMantissa1 <<= iExponent1;
6312 iExponent1 = 1 - iExponent1;
6313 }
6314 fFsw |= X86_FSW_DE;
6315 if (!(fFcw & X86_FCW_DM))
6316 fFsw |= X86_FSW_ES | X86_FSW_B;
6317 }
6318
6319 if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL_EX(uMantissa2, iExponent2))
6320 {
6321 if (RTFLOAT80U_IS_PSEUDO_DENORMAL_EX(uMantissa2, iExponent2))
6322 iExponent2 = 1;
6323 else
6324 {
6325 iExponent2 = 64 - ASMBitLastSetU64(uMantissa2);
6326 uMantissa2 <<= iExponent2;
6327 iExponent2 = 1 - iExponent2;
6328 }
6329 fFsw |= X86_FSW_DE;
6330 if (!(fFcw & X86_FCW_DM))
6331 fFsw |= X86_FSW_ES | X86_FSW_B;
6332 }
6333
6334 /*
6335 * Test if equal (val1 == val2):
6336 */
6337 if ( uMantissa1 == uMantissa2
6338 && iExponent1 == iExponent2
6339 && ( fSign1 == fSign2
6340 || (uMantissa1 == 0 && iExponent1 == 0) /* ignore sign for zero */ ) )
6341 fFsw |= X86_FSW_C3;
6342 /*
6343 * Test if less than (val1 < val2):
6344 */
6345 else if (fSign1 && !fSign2)
6346 fFsw |= X86_FSW_C0;
6347 else if (fSign1 == fSign2)
6348 {
6349        /* Zeros are problematic; however, at most one of the two values can be zero here. */
6350 if (RTFLOAT80U_IS_ZERO_EX(uMantissa1, iExponent1))
6351 return !fSign1 ? fFsw | X86_FSW_C0 : fFsw;
6352 if (RTFLOAT80U_IS_ZERO_EX(uMantissa2, iExponent2))
6353 return fSign1 ? fFsw | X86_FSW_C0 : fFsw;
6354
6355 if ( fSign1
6356 ^ ( iExponent1 < iExponent2
6357 || ( iExponent1 == iExponent2
6358 && uMantissa1 < uMantissa2 ) ) )
6359 fFsw |= X86_FSW_C0;
6360 }
6361 /* else: No flags set if greater. */
6362
6363 return fFsw;
6364}
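
/* For reference, the worker above produces the usual FCOM/FUCOM condition code
   encoding: ST(0) > src => C3=C2=C0=0; ST(0) < src => C0=1; ST(0) == src => C3=1;
   unordered (NaN) operands => C3=C2=C0=1. */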
6365
6366
6367IEM_DECL_IMPL_DEF(void, iemAImpl_fcom_r80_by_r80,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
6368 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6369{
6370 *pfFsw = iemAImpl_fcom_r80_by_r80_worker(pr80Val1, pr80Val2, pFpuState->FCW, 6 << X86_FSW_TOP_SHIFT, true /*fIeOnAllNaNs*/);
6371}
6372
6373
6374
6375
6376IEM_DECL_IMPL_DEF(void, iemAImpl_fucom_r80_by_r80,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
6377 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6378{
6379 *pfFsw = iemAImpl_fcom_r80_by_r80_worker(pr80Val1, pr80Val2, pFpuState->FCW, 6 << X86_FSW_TOP_SHIFT, false /*fIeOnAllNaNs*/);
6380}
6381
6382
6383IEM_DECL_IMPL_DEF(void, iemAImpl_fcom_r80_by_r64,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
6384 PCRTFLOAT80U pr80Val1, PCRTFLOAT64U pr64Val2))
6385{
6386 RTFLOAT80U r80Val2;
6387 uint16_t fFsw = iemAImplConvertR64ToR80(pr64Val2, &r80Val2);
6388 Assert(!fFsw || fFsw == X86_FSW_DE);
6389 *pfFsw = iemAImpl_fcom_r80_by_r80_worker(pr80Val1, &r80Val2, pFpuState->FCW, 7 << X86_FSW_TOP_SHIFT, true /*fIeOnAllNaNs*/);
6390 if (fFsw != 0 && !(*pfFsw & X86_FSW_IE))
6391 {
6392 if (!(pFpuState->FCW & X86_FCW_DM))
6393 fFsw |= X86_FSW_ES | X86_FSW_B;
6394 *pfFsw |= fFsw;
6395 }
6396}
6397
6398
6399IEM_DECL_IMPL_DEF(void, iemAImpl_fcom_r80_by_r32,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
6400 PCRTFLOAT80U pr80Val1, PCRTFLOAT32U pr32Val2))
6401{
6402 RTFLOAT80U r80Val2;
6403 uint16_t fFsw = iemAImplConvertR32ToR80(pr32Val2, &r80Val2);
6404 Assert(!fFsw || fFsw == X86_FSW_DE);
6405 *pfFsw = iemAImpl_fcom_r80_by_r80_worker(pr80Val1, &r80Val2, pFpuState->FCW, 7 << X86_FSW_TOP_SHIFT, true /*fIeOnAllNaNs*/);
6406 if (fFsw != 0 && !(*pfFsw & X86_FSW_IE))
6407 {
6408 if (!(pFpuState->FCW & X86_FCW_DM))
6409 fFsw |= X86_FSW_ES | X86_FSW_B;
6410 *pfFsw |= fFsw;
6411 }
6412}
6413
6414
6415IEM_DECL_IMPL_DEF(void, iemAImpl_ficom_r80_by_i32,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
6416 PCRTFLOAT80U pr80Val1, int32_t const *pi32Val2))
6417{
6418 RTFLOAT80U r80Val2;
6419 iemAImpl_fcom_r80_by_r80(pFpuState, pfFsw, pr80Val1, iemAImplConvertI32ToR80(*pi32Val2, &r80Val2));
6420 *pfFsw = (*pfFsw & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT);
6421}
6422
6423
6424IEM_DECL_IMPL_DEF(void, iemAImpl_ficom_r80_by_i16,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
6425 PCRTFLOAT80U pr80Val1, int16_t const *pi16Val2))
6426{
6427 RTFLOAT80U r80Val2;
6428 iemAImpl_fcom_r80_by_r80(pFpuState, pfFsw, pr80Val1, iemAImplConvertI16ToR80(*pi16Val2, &r80Val2));
6429 *pfFsw = (*pfFsw & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT);
6430}
6431
6432
6433/**
6434 * Worker for fcomi & fucomi.
6435 */
6436static uint32_t iemAImpl_fcomi_r80_by_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2,
6437 uint16_t fFcw, uint16_t fFswIn, bool fIeOnAllNaNs, uint16_t *pfFsw)
6438{
6439 uint16_t fFsw = iemAImpl_fcom_r80_by_r80_worker(pr80Val1, pr80Val2, fFcw, 6 << X86_FSW_TOP_SHIFT, fIeOnAllNaNs);
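    /* FCOMI/FUCOMI report the comparison result in EFLAGS: ZF <- C3, PF <- C2, CF <- C0. */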
6440 uint32_t fEflags = ((fFsw & X86_FSW_C3) >> (X86_FSW_C3_BIT - X86_EFL_ZF_BIT))
6441 | ((fFsw & X86_FSW_C2) >> (X86_FSW_C2_BIT - X86_EFL_PF_BIT))
6442 | ((fFsw & X86_FSW_C0) >> (X86_FSW_C0_BIT - X86_EFL_CF_BIT));
6443
6444    /* Note! Contrary to the docs, C1 is not cleared; all the condition code flags are preserved. */
6445 *pfFsw = (fFsw & ~X86_FSW_C_MASK) | (fFswIn & X86_FSW_C_MASK);
6446 return fEflags | X86_EFL_IF | X86_EFL_RA1_MASK;
6447}
6448
6449
6450IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_fcomi_r80_by_r80,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
6451 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6452{
6453 return iemAImpl_fcomi_r80_by_r80_worker(pr80Val1, pr80Val2, pFpuState->FCW, pFpuState->FSW, true /*fIeOnAllNaNs*/, pfFsw);
6454}
6455
6456
6457IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_fucomi_r80_by_r80,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
6458 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6459{
6460 return iemAImpl_fcomi_r80_by_r80_worker(pr80Val1, pr80Val2, pFpuState->FCW, pFpuState->FSW, false /*fIeOnAllNaNs*/, pfFsw);
6461}
6462
6463
6464/*********************************************************************************************************************************
6465* x87 FPU Other Operations *
6466*********************************************************************************************************************************/
6467
6468/**
6469 * Helper for iemAImpl_frndint_r80, called both on normal and denormal numbers.
6470 */
6471static uint16_t iemAImpl_frndint_r80_normal(PCRTFLOAT80U pr80Val, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
6472{
6473 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
6474 iemFpuSoftF80ToIprt(pr80Result, extF80_roundToInt(iemFpuSoftF80FromIprt(pr80Val), SoftState.roundingMode,
6475 true /*exact / generate #PE */, &SoftState));
6476 return IEM_SOFTFLOAT_STATE_TO_FSW(fFsw, &SoftState, fFcw);
6477}
6478
6479
6480IEM_DECL_IMPL_DEF(void, iemAImpl_frndint_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6481{
6482 uint16_t const fFcw = pFpuState->FCW;
6483 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
6484
6485 if (RTFLOAT80U_IS_NORMAL(pr80Val))
6486 fFsw = iemAImpl_frndint_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
6487 else if ( RTFLOAT80U_IS_ZERO(pr80Val)
6488 || RTFLOAT80U_IS_QUIET_NAN(pr80Val)
6489 || RTFLOAT80U_IS_INDEFINITE(pr80Val)
6490 || RTFLOAT80U_IS_INF(pr80Val))
6491 pFpuRes->r80Result = *pr80Val;
6492 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val))
6493 {
6494 fFsw |= X86_FSW_DE;
6495 if (fFcw & X86_FCW_DM)
6496 fFsw = iemAImpl_frndint_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
6497 else
6498 {
6499 pFpuRes->r80Result = *pr80Val;
6500 fFsw |= X86_FSW_ES | X86_FSW_B;
6501 }
6502 }
6503 else
6504 {
6505 if (fFcw & X86_FCW_IM)
6506 {
6507 if (!RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val))
6508 pFpuRes->r80Result = g_r80Indefinite;
6509 else
6510 {
6511 pFpuRes->r80Result = *pr80Val;
6512 pFpuRes->r80Result.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
6513 }
6514 }
6515 else
6516 {
6517 pFpuRes->r80Result = *pr80Val;
6518 fFsw |= X86_FSW_ES | X86_FSW_B;
6519 }
6520 fFsw |= X86_FSW_IE;
6521 }
6522 pFpuRes->FSW = fFsw;
6523}
6524
6525
6526IEM_DECL_IMPL_DEF(void, iemAImpl_fscale_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6527 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6528{
6529 /* The SoftFloat worker function extF80_scale_extF80 is of our creation, so
6530 it does everything we need it to do. */
6531 uint16_t const fFcw = pFpuState->FCW;
6532 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
6533 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
6534 extFloat80_t r80XResult = extF80_scale_extF80(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2), &SoftState);
6535 pFpuRes->FSW = iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
6536}
6537
6538
6539/**
6540 * Helper for iemAImpl_fsqrt_r80, called both on normal and denormal numbers.
6541 */
6542static uint16_t iemAImpl_fsqrt_r80_normal(PCRTFLOAT80U pr80Val, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
6543{
6544 Assert(!pr80Val->s.fSign);
6545 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
6546 iemFpuSoftF80ToIprt(pr80Result, extF80_sqrt(iemFpuSoftF80FromIprt(pr80Val), &SoftState));
6547 return IEM_SOFTFLOAT_STATE_TO_FSW(fFsw, &SoftState, fFcw);
6548}
6549
6550
6551IEM_DECL_IMPL_DEF(void, iemAImpl_fsqrt_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6552{
6553 uint16_t const fFcw = pFpuState->FCW;
6554 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
6555
6556 if (RTFLOAT80U_IS_NORMAL(pr80Val) && !pr80Val->s.fSign)
6557 fFsw = iemAImpl_fsqrt_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
6558 else if ( RTFLOAT80U_IS_ZERO(pr80Val)
6559 || RTFLOAT80U_IS_QUIET_NAN(pr80Val)
6560 || RTFLOAT80U_IS_INDEFINITE(pr80Val)
6561 || (RTFLOAT80U_IS_INF(pr80Val) && !pr80Val->s.fSign))
6562 pFpuRes->r80Result = *pr80Val;
6563 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val) && !pr80Val->s.fSign) /* Negative denormals only generate #IE! */
6564 {
6565 fFsw |= X86_FSW_DE;
6566 if (fFcw & X86_FCW_DM)
6567 fFsw = iemAImpl_fsqrt_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
6568 else
6569 {
6570 pFpuRes->r80Result = *pr80Val;
6571 fFsw |= X86_FSW_ES | X86_FSW_B;
6572 }
6573 }
6574 else
6575 {
6576 if (fFcw & X86_FCW_IM)
6577 {
6578 if (!RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val))
6579 pFpuRes->r80Result = g_r80Indefinite;
6580 else
6581 {
6582 pFpuRes->r80Result = *pr80Val;
6583 pFpuRes->r80Result.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
6584 }
6585 }
6586 else
6587 {
6588 pFpuRes->r80Result = *pr80Val;
6589 fFsw |= X86_FSW_ES | X86_FSW_B;
6590 }
6591 fFsw |= X86_FSW_IE;
6592 }
6593 pFpuRes->FSW = fFsw;
6594}
6595
6596
6597/**
6598 * @code{.unparsed}
6599 * x x * ln2
6600 * f(x) = 2 - 1 = e - 1
6601 *
6602 * @endcode
6603 *
6604 * We can approximate e^x by a Taylor/Maclaurin series (see
6605 * https://en.wikipedia.org/wiki/Taylor_series#Exponential_function):
6606 * @code{.unparsed}
6607 * n 0 1 2 3 4
6608 * inf x x x x x x
6609 * SUM ----- = --- + --- + --- + --- + --- + ...
6610 * n=0 n! 0! 1! 2! 3! 4!
6611 *
6612 * 2 3 4
6613 * x x x
6614 * = 1 + x + --- + --- + --- + ...
6615 * 2! 3! 4!
6616 * @endcode
6617 *
6618 * Given z = x * ln2, we get:
6619 * @code{.unparsed}
6620 * 2 3 4 n
6621 * z z z z z
6622 * e - 1 = z + --- + --- + --- + ... + ---
6623 * 2! 3! 4! n!
6624 * @endcode
6625 *
6626 * Wanting to use Horner's method, we move one z outside and get:
6627 * @code{.unparsed}
6628 * 2 3 (n-1)
6629 * z z z z
6630 * = z ( 1 + --- + --- + --- + ... + ------- )
6631 * 2! 3! 4! n!
6632 * @endcode
6633 *
6634 * The constants we need for using Horner's methods are 1 and 1 / n!.
6635 *
6636 * For very tiny x values, we can get away with f(x) = x * ln 2, because we
6637 * don't have the necessary precision to represent 1.0 + z/3 + ... and can
6638 * approximate it to be 1.0.  For a visual demonstration of this, check out
6639 * https://www.desmos.com/calculator/vidcdxizd9 (for as long as it remains
6640 * valid), plotting f(x) = 2^x - 1 and f(x) = x * ln2.
6641 *
6642 *
6643 * As far as constant accuracy goes, figure 0.1 "80387 Block Diagram" in the
6644 * "80387 Data Sheet" (order 231920-002; Appendix E in 80387 PRM 231917-001;
6645 * Military i387SX 271166-002) indicates that constants are 67-bit (constant
6646 * ROM block) and the internal mantissa size is 68-bit (mantissa adder & barrel
6647 * shifter blocks).  (The one bit difference is probably an implicit bit missing
6648 * from the constant ROM.)  A paper on division and sqrt on the AMD-K7 by
6649 * Stuart F. Oberman states that it internally used a 68-bit mantissa with an
6650 * 18-bit exponent.
6651 *
6652 * However, even when sticking to 67-bit constants / 68-bit mantissas, I have
6653 * not yet successfully reproduced the exact results from an Intel 10980XE;
6654 * there is always a portion of rounding differences.  Not going to spend too
6655 * much time on getting this 100% the same, at least not now.
6656 *
6657 * P.S. If someone is really curious about the 8087 and its constants:
6658 * http://www.righto.com/2020/05/extracting-rom-constants-from-8087-math.html
6659 *
6660 *
6661 * @param pr80Val The exponent value (x), less than 1.0, greater than
6662 * -1.0 and not zero. This can be a normal, denormal
6663 * or pseudo-denormal value.
6664 * @param pr80Result Where to return the result.
6665 * @param fFcw FPU control word.
6666 * @param fFsw FPU status word.
6667 */
6668static uint16_t iemAImpl_f2xm1_r80_normal(PCRTFLOAT80U pr80Val, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
6669{
6670 /* As mentioned above, we can skip the expensive polynomial calculation
6671 as it will be close enough to 1.0 that it makes no difference.
6672
6673       The cutoff point for the Intel 10980XE is exponents >= -69.  Intel
6674 also seems to be using a 67-bit or 68-bit constant value, and we get
6675 a smattering of rounding differences if we go for higher precision. */
6676 if (pr80Val->s.uExponent <= RTFLOAT80U_EXP_BIAS - 69)
6677 {
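        /* Compute x * ln2 using a 128-bit mantissa for ln2 (the Intel-tuned constant);
           the exponent is passed separately to the round & compose helper below. */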
6678 RTUINT256U u256;
6679 RTUInt128MulByU64Ex(&u256, &g_u128Ln2MantissaIntel, pr80Val->s.uMantissa);
6680 u256.QWords.qw0 |= 1; /* force #PE */
6681 fFsw = iemFpuFloat80RoundAndComposeFrom192(pr80Result, pr80Val->s.fSign, &u256,
6682 !RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val) && !RTFLOAT80U_IS_DENORMAL(pr80Val)
6683 ? (int32_t)pr80Val->s.uExponent - RTFLOAT80U_EXP_BIAS
6684 : 1 - RTFLOAT80U_EXP_BIAS,
6685 fFcw, fFsw);
6686 }
6687 else
6688 {
6689#ifdef IEM_WITH_FLOAT128_FOR_FPU
6690    /* This approach is not good enough for small values; we end up with zero. */
6691 int const fOldRounding = iemFpuF128SetRounding(fFcw);
6692 _Float128 rd128Val = iemFpuF128FromFloat80(pr80Val, fFcw);
6693 _Float128 rd128Result = powf128(2.0L, rd128Val);
6694 rd128Result -= 1.0L;
6695 fFsw = iemFpuF128ToFloat80(pr80Result, rd128Result, fFcw, fFsw);
6696 iemFpuF128RestoreRounding(fOldRounding);
6697
6698# else
6699 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
6700 float128_t const x = iemFpuSoftF128FromFloat80(pr80Val);
6701
6702 /* As mentioned above, enforce 68-bit internal mantissa width to better
6703 match the Intel 10980XE results. */
6704 unsigned const cPrecision = 68;
6705
6706 /* first calculate z = x * ln2 */
6707 float128_t z = iemFpuSoftF128Precision(f128_mul(x, iemFpuSoftF128PrecisionIprt(&g_r128Ln2, cPrecision), &SoftState),
6708 cPrecision);
6709
6710 /* Then do the polynomial evaluation. */
6711 float128_t r = iemFpuSoftF128HornerPoly(z, g_ar128F2xm1HornerConsts, RT_ELEMENTS(g_ar128F2xm1HornerConsts),
6712 cPrecision, &SoftState);
6713 r = f128_mul(z, r, &SoftState);
6714
6715 /* Output the result. */
6716 fFsw = iemFpuSoftF128ToFloat80(pr80Result, r, fFcw, fFsw);
6717# endif
6718 }
6719 return fFsw;
6720}
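
/*
 * Illustrative sketch (not part of the original file and not built): the Horner
 * evaluation scheme described in the big comment above, shown with plain C
 * doubles instead of the 128-bit softfloat values and the
 * g_ar128F2xm1HornerConsts table used by the real code.  The function name and
 * the coefficient count of eight are made up for this example.
 */
#if 0
static double iemDemoExpZMinus1ViaHorner(double z) /* z = x * ln2, |x| < 1 */
{
    /* 1/n! for n = 9 down to 2, i.e. the highest order term first. */
    static double const s_adInvFactorials[] =
    {
        1.0 / 362880.0, 1.0 / 40320.0, 1.0 / 5040.0, 1.0 / 720.0,
        1.0 / 120.0,    1.0 / 24.0,    1.0 / 6.0,    1.0 / 2.0
    };
    double rdPoly = 0.0;
    for (unsigned i = 0; i < sizeof(s_adInvFactorials) / sizeof(s_adInvFactorials[0]); i++)
        rdPoly = (rdPoly + s_adInvFactorials[i]) * z;   /* = z/2! + z^2/3! + ... + z^8/9! */
    return z * (1.0 + rdPoly);                          /* e^z - 1 = z*(1 + z/2! + z^2/3! + ...) */
}
#endif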
6721
6722
6723IEM_DECL_IMPL_DEF(void, iemAImpl_f2xm1_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6724{
6725 uint16_t const fFcw = pFpuState->FCW;
6726 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
6727
6728 if (RTFLOAT80U_IS_NORMAL(pr80Val))
6729 {
6730 if (pr80Val->s.uExponent < RTFLOAT80U_EXP_BIAS)
6731 fFsw = iemAImpl_f2xm1_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
6732 else
6733 {
6734 /* Special case:
6735 2^+1.0 - 1.0 = 1.0
6736 2^-1.0 - 1.0 = -0.5 */
6737 if ( pr80Val->s.uExponent == RTFLOAT80U_EXP_BIAS
6738 && pr80Val->s.uMantissa == RT_BIT_64(63))
6739 {
6740 pFpuRes->r80Result.s.uMantissa = RT_BIT_64(63);
6741 pFpuRes->r80Result.s.uExponent = RTFLOAT80U_EXP_BIAS - pr80Val->s.fSign;
6742 pFpuRes->r80Result.s.fSign = pr80Val->s.fSign;
6743 }
6744 /* ST(0) > 1.0 || ST(0) < -1.0: undefined behavior */
6745 /** @todo 287 is documented to only accept values 0 <= ST(0) <= 0.5. */
6746 else
6747 pFpuRes->r80Result = *pr80Val;
6748 fFsw |= X86_FSW_PE;
6749 if (!(fFcw & X86_FCW_PM))
6750 fFsw |= X86_FSW_ES | X86_FSW_B;
6751 }
6752 }
6753 else if ( RTFLOAT80U_IS_ZERO(pr80Val)
6754 || RTFLOAT80U_IS_QUIET_NAN(pr80Val)
6755 || RTFLOAT80U_IS_INDEFINITE(pr80Val))
6756 pFpuRes->r80Result = *pr80Val;
6757 else if (RTFLOAT80U_IS_INF(pr80Val))
6758 pFpuRes->r80Result = pr80Val->s.fSign ? g_ar80One[1] : *pr80Val;
6759 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val))
6760 {
6761 fFsw |= X86_FSW_DE;
6762 if (fFcw & X86_FCW_DM)
6763 fFsw = iemAImpl_f2xm1_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
6764 else
6765 {
6766 pFpuRes->r80Result = *pr80Val;
6767 fFsw |= X86_FSW_ES | X86_FSW_B;
6768 }
6769 }
6770 else
6771 {
6772 if ( ( RTFLOAT80U_IS_UNNORMAL(pr80Val)
6773 || RTFLOAT80U_IS_PSEUDO_NAN(pr80Val))
6774 && (fFcw & X86_FCW_IM))
6775 pFpuRes->r80Result = g_r80Indefinite;
6776 else
6777 {
6778 pFpuRes->r80Result = *pr80Val;
6779 if (RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val) && (fFcw & X86_FCW_IM))
6780 pFpuRes->r80Result.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
6781 }
6782 fFsw |= X86_FSW_IE;
6783 if (!(fFcw & X86_FCW_IM))
6784 fFsw |= X86_FSW_ES | X86_FSW_B;
6785 }
6786 pFpuRes->FSW = fFsw;
6787}
6788
6789#endif /* IEM_WITHOUT_ASSEMBLY */
6790
6791IEM_DECL_IMPL_DEF(void, iemAImpl_f2xm1_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6792{
6793 iemAImpl_f2xm1_r80(pFpuState, pFpuRes, pr80Val);
6794}
6795
6796IEM_DECL_IMPL_DEF(void, iemAImpl_f2xm1_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6797{
6798 iemAImpl_f2xm1_r80(pFpuState, pFpuRes, pr80Val);
6799}
6800
6801#ifdef IEM_WITHOUT_ASSEMBLY
6802
6803IEM_DECL_IMPL_DEF(void, iemAImpl_fabs_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6804{
6805 pFpuRes->FSW = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
6806 pFpuRes->r80Result = *pr80Val;
6807 pFpuRes->r80Result.s.fSign = 0;
6808}
6809
6810
6811IEM_DECL_IMPL_DEF(void, iemAImpl_fchs_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6812{
6813 pFpuRes->FSW = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
6814 pFpuRes->r80Result = *pr80Val;
6815 pFpuRes->r80Result.s.fSign = !pr80Val->s.fSign;
6816}
6817
6818
6819IEM_DECL_IMPL_DEF(void, iemAImpl_fxtract_r80_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
6820{
6821 uint16_t const fFcw = pFpuState->FCW;
6822 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
6823
6824 if (RTFLOAT80U_IS_NORMAL(pr80Val))
6825 {
6826 softfloat_state_t Ignored = SOFTFLOAT_STATE_INIT_DEFAULTS();
6827 iemFpuSoftF80ToIprt(&pFpuResTwo->r80Result1, i32_to_extF80((int32_t)pr80Val->s.uExponent - RTFLOAT80U_EXP_BIAS, &Ignored));
6828
6829 pFpuResTwo->r80Result2.s.fSign = pr80Val->s.fSign;
6830 pFpuResTwo->r80Result2.s.uExponent = RTFLOAT80U_EXP_BIAS;
6831 pFpuResTwo->r80Result2.s.uMantissa = pr80Val->s.uMantissa;
6832 }
6833 else if (RTFLOAT80U_IS_ZERO(pr80Val))
6834 {
6835 fFsw |= X86_FSW_ZE;
6836 if (fFcw & X86_FCW_ZM)
6837 {
6838 pFpuResTwo->r80Result1 = g_ar80Infinity[1];
6839 pFpuResTwo->r80Result2 = *pr80Val;
6840 }
6841 else
6842 {
6843 pFpuResTwo->r80Result2 = *pr80Val;
6844 fFsw = X86_FSW_ES | X86_FSW_B | (fFsw & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT);
6845 }
6846 }
6847 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val))
6848 {
6849 fFsw |= X86_FSW_DE;
6850 if (fFcw & X86_FCW_DM)
6851 {
6852 pFpuResTwo->r80Result2.s.fSign = pr80Val->s.fSign;
6853 pFpuResTwo->r80Result2.s.uExponent = RTFLOAT80U_EXP_BIAS;
6854 pFpuResTwo->r80Result2.s.uMantissa = pr80Val->s.uMantissa;
6855 int32_t iExponent = -16382;
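            /* Shift the denormal mantissa into normalized position, adjusting the unbiased exponent accordingly. */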
6856 while (!(pFpuResTwo->r80Result2.s.uMantissa & RT_BIT_64(63)))
6857 {
6858 pFpuResTwo->r80Result2.s.uMantissa <<= 1;
6859 iExponent--;
6860 }
6861
6862 softfloat_state_t Ignored = SOFTFLOAT_STATE_INIT_DEFAULTS();
6863 iemFpuSoftF80ToIprt(&pFpuResTwo->r80Result1, i32_to_extF80(iExponent, &Ignored));
6864 }
6865 else
6866 {
6867 pFpuResTwo->r80Result2 = *pr80Val;
6868 fFsw = X86_FSW_ES | X86_FSW_B | (fFsw & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT);
6869 }
6870 }
6871 else if ( RTFLOAT80U_IS_QUIET_NAN(pr80Val)
6872 || RTFLOAT80U_IS_INDEFINITE(pr80Val))
6873 {
6874 pFpuResTwo->r80Result1 = *pr80Val;
6875 pFpuResTwo->r80Result2 = *pr80Val;
6876 }
6877 else if (RTFLOAT80U_IS_INF(pr80Val))
6878 {
6879 pFpuResTwo->r80Result1 = g_ar80Infinity[0];
6880 pFpuResTwo->r80Result2 = *pr80Val;
6881 }
6882 else
6883 {
6884 if (fFcw & X86_FCW_IM)
6885 {
6886 if (!RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val))
6887 pFpuResTwo->r80Result1 = g_r80Indefinite;
6888 else
6889 {
6890 pFpuResTwo->r80Result1 = *pr80Val;
6891 pFpuResTwo->r80Result1.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
6892 }
6893 pFpuResTwo->r80Result2 = pFpuResTwo->r80Result1;
6894 }
6895 else
6896 {
6897 pFpuResTwo->r80Result2 = *pr80Val;
6898 fFsw = X86_FSW_ES | X86_FSW_B | (fFsw & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT);
6899 }
6900 fFsw |= X86_FSW_IE;
6901 }
6902 pFpuResTwo->FSW = fFsw;
6903}
6904
6905
6906IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2x_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6907 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6908{
6909 RT_NOREF(pFpuState, pFpuRes, pr80Val1, pr80Val2);
6910 AssertReleaseFailed();
6911}
6912
6913#endif /* IEM_WITHOUT_ASSEMBLY */
6914
6915IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2x_r80_by_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6916 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6917{
6918 iemAImpl_fyl2x_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
6919}
6920
6921IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2x_r80_by_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6922 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6923{
6924 iemAImpl_fyl2x_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
6925}
6926
6927#if defined(IEM_WITHOUT_ASSEMBLY)
6928
6929IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2xp1_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6930 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6931{
6932 RT_NOREF(pFpuState, pFpuRes, pr80Val1, pr80Val2);
6933 AssertReleaseFailed();
6934}
6935
6936#endif /* IEM_WITHOUT_ASSEMBLY */
6937
6938IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2xp1_r80_by_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6939 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6940{
6941 iemAImpl_fyl2xp1_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
6942}
6943
6944IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2xp1_r80_by_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6945 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6946{
6947 iemAImpl_fyl2xp1_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
6948}
6949
6950
6951/*********************************************************************************************************************************
6952* MMX, SSE & AVX *
6953*********************************************************************************************************************************/
6954
6955/*
6956 * MOVSLDUP / VMOVSLDUP
6957 */
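/* (MOVSLDUP duplicates the even dword elements: dst[0] = dst[1] = src[0] and
   dst[2] = dst[3] = src[2]; the VEX.256 form repeats this per 128-bit lane.) */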
6958IEM_DECL_IMPL_DEF(void, iemAImpl_movsldup,(PRTUINT128U puDst, PCRTUINT128U puSrc))
6959{
6960 puDst->au32[0] = puSrc->au32[0];
6961 puDst->au32[1] = puSrc->au32[0];
6962 puDst->au32[2] = puSrc->au32[2];
6963 puDst->au32[3] = puSrc->au32[2];
6964}
6965
6966#ifdef IEM_WITH_VEX
6967
6968IEM_DECL_IMPL_DEF(void, iemAImpl_vmovsldup_256_rr,(PX86XSAVEAREA pXState, uint8_t iYRegDst, uint8_t iYRegSrc))
6969{
6970 pXState->x87.aXMM[iYRegDst].au32[0] = pXState->x87.aXMM[iYRegSrc].au32[0];
6971 pXState->x87.aXMM[iYRegDst].au32[1] = pXState->x87.aXMM[iYRegSrc].au32[0];
6972 pXState->x87.aXMM[iYRegDst].au32[2] = pXState->x87.aXMM[iYRegSrc].au32[2];
6973 pXState->x87.aXMM[iYRegDst].au32[3] = pXState->x87.aXMM[iYRegSrc].au32[2];
6974 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[0] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[0];
6975 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[1] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[0];
6976 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[2] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[2];
6977 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[3] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[2];
6978}
6979
6980
6981IEM_DECL_IMPL_DEF(void, iemAImpl_vmovsldup_256_rm,(PX86XSAVEAREA pXState, uint8_t iYRegDst, PCRTUINT256U pSrc))
6982{
6983 pXState->x87.aXMM[iYRegDst].au32[0] = pSrc->au32[0];
6984 pXState->x87.aXMM[iYRegDst].au32[1] = pSrc->au32[0];
6985 pXState->x87.aXMM[iYRegDst].au32[2] = pSrc->au32[2];
6986 pXState->x87.aXMM[iYRegDst].au32[3] = pSrc->au32[2];
6987 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[0] = pSrc->au32[4];
6988 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[1] = pSrc->au32[4];
6989 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[2] = pSrc->au32[6];
6990 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[3] = pSrc->au32[6];
6991}
6992
6993#endif /* IEM_WITH_VEX */
6994
6995
6996/*
6997 * MOVSHDUP / VMOVSHDUP
6998 */
6999IEM_DECL_IMPL_DEF(void, iemAImpl_movshdup,(PRTUINT128U puDst, PCRTUINT128U puSrc))
7000{
7001 puDst->au32[0] = puSrc->au32[1];
7002 puDst->au32[1] = puSrc->au32[1];
7003 puDst->au32[2] = puSrc->au32[3];
7004 puDst->au32[3] = puSrc->au32[3];
7005}
7006
7007#ifdef IEM_WITH_VEX
7008
7009IEM_DECL_IMPL_DEF(void, iemAImpl_vmovshdup_256_rr,(PX86XSAVEAREA pXState, uint8_t iYRegDst, uint8_t iYRegSrc))
7010{
7011 pXState->x87.aXMM[iYRegDst].au32[0] = pXState->x87.aXMM[iYRegSrc].au32[1];
7012 pXState->x87.aXMM[iYRegDst].au32[1] = pXState->x87.aXMM[iYRegSrc].au32[1];
7013 pXState->x87.aXMM[iYRegDst].au32[2] = pXState->x87.aXMM[iYRegSrc].au32[3];
7014 pXState->x87.aXMM[iYRegDst].au32[3] = pXState->x87.aXMM[iYRegSrc].au32[3];
7015 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[0] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[1];
7016 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[1] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[1];
7017 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[2] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[3];
7018 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[3] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[3];
7019}
7020
7021
7022IEM_DECL_IMPL_DEF(void, iemAImpl_vmovshdup_256_rm,(PX86XSAVEAREA pXState, uint8_t iYRegDst, PCRTUINT256U pSrc))
7023{
7024 pXState->x87.aXMM[iYRegDst].au32[0] = pSrc->au32[1];
7025 pXState->x87.aXMM[iYRegDst].au32[1] = pSrc->au32[1];
7026 pXState->x87.aXMM[iYRegDst].au32[2] = pSrc->au32[3];
7027 pXState->x87.aXMM[iYRegDst].au32[3] = pSrc->au32[3];
7028 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[0] = pSrc->au32[5];
7029 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[1] = pSrc->au32[5];
7030 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[2] = pSrc->au32[7];
7031 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[3] = pSrc->au32[7];
7032}
7033
7034#endif /* IEM_WITH_VEX */
7035
7036
7037/*
7038 * MOVDDUP / VMOVDDUP
7039 */
7040IEM_DECL_IMPL_DEF(void, iemAImpl_movddup,(PRTUINT128U puDst, uint64_t uSrc))
7041{
7042 puDst->au64[0] = uSrc;
7043 puDst->au64[1] = uSrc;
7044}
7045
7046#ifdef IEM_WITH_VEX
7047
7048IEM_DECL_IMPL_DEF(void, iemAImpl_vmovddup_256_rr,(PX86XSAVEAREA pXState, uint8_t iYRegDst, uint8_t iYRegSrc))
7049{
7050 pXState->x87.aXMM[iYRegDst].au64[0] = pXState->x87.aXMM[iYRegSrc].au64[0];
7051 pXState->x87.aXMM[iYRegDst].au64[1] = pXState->x87.aXMM[iYRegSrc].au64[0];
7052 pXState->u.YmmHi.aYmmHi[iYRegDst].au64[0] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au64[0];
7053 pXState->u.YmmHi.aYmmHi[iYRegDst].au64[1] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au64[0];
7054}
7055
7056IEM_DECL_IMPL_DEF(void, iemAImpl_vmovddup_256_rm,(PX86XSAVEAREA pXState, uint8_t iYRegDst, PCRTUINT256U pSrc))
7057{
7058 pXState->x87.aXMM[iYRegDst].au64[0] = pSrc->au64[0];
7059 pXState->x87.aXMM[iYRegDst].au64[1] = pSrc->au64[0];
7060 pXState->u.YmmHi.aYmmHi[iYRegDst].au64[0] = pSrc->au64[2];
7061 pXState->u.YmmHi.aYmmHi[iYRegDst].au64[1] = pSrc->au64[2];
7062}
7063
7064#endif /* IEM_WITH_VEX */
7065
7066
7067/*
7068 * PAND / VPAND / PANDPS / VPANDPS / PANDPD / VPANDPD
7069 */
7070#ifdef IEM_WITHOUT_ASSEMBLY
7071
7072IEM_DECL_IMPL_DEF(void, iemAImpl_pand_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7073{
7074 RT_NOREF(pFpuState);
7075 *puDst &= *puSrc;
7076}
7077
7078
7079IEM_DECL_IMPL_DEF(void, iemAImpl_pand_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7080{
7081 RT_NOREF(pFpuState);
7082 puDst->au64[0] &= puSrc->au64[0];
7083 puDst->au64[1] &= puSrc->au64[1];
7084}
7085
7086#endif
7087
7088IEM_DECL_IMPL_DEF(void, iemAImpl_vpand_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7089 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7090{
7091 RT_NOREF(pExtState);
7092 puDst->au64[0] = puSrc1->au64[0] & puSrc2->au64[0];
7093 puDst->au64[1] = puSrc1->au64[1] & puSrc2->au64[1];
7094}
7095
7096
7097IEM_DECL_IMPL_DEF(void, iemAImpl_vpand_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7098 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7099{
7100 RT_NOREF(pExtState);
7101 puDst->au64[0] = puSrc1->au64[0] & puSrc2->au64[0];
7102 puDst->au64[1] = puSrc1->au64[1] & puSrc2->au64[1];
7103 puDst->au64[2] = puSrc1->au64[2] & puSrc2->au64[2];
7104 puDst->au64[3] = puSrc1->au64[3] & puSrc2->au64[3];
7105}
7106
7107
7108/*
7109 * PANDN / VPANDN / PANDNPS / VPANDNPS / PANDNPD / VPANDNPD
7110 */
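/* (Note the operand twist: the legacy MMX/SSE forms compute dst = ~dst & src,
   while the VEX forms compute dst = ~src1 & src2.) */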
7111#ifdef IEM_WITHOUT_ASSEMBLY
7112
7113IEM_DECL_IMPL_DEF(void, iemAImpl_pandn_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7114{
7115 RT_NOREF(pFpuState);
7116 *puDst = ~*puDst & *puSrc;
7117}
7118
7119
7120IEM_DECL_IMPL_DEF(void, iemAImpl_pandn_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7121{
7122 RT_NOREF(pFpuState);
7123 puDst->au64[0] = ~puDst->au64[0] & puSrc->au64[0];
7124 puDst->au64[1] = ~puDst->au64[1] & puSrc->au64[1];
7125}
7126
7127#endif
7128
7129IEM_DECL_IMPL_DEF(void, iemAImpl_vpandn_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7130 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7131{
7132 RT_NOREF(pExtState);
7133 puDst->au64[0] = ~puSrc1->au64[0] & puSrc2->au64[0];
7134 puDst->au64[1] = ~puSrc1->au64[1] & puSrc2->au64[1];
7135}
7136
7137
7138IEM_DECL_IMPL_DEF(void, iemAImpl_vpandn_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7139 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7140{
7141 RT_NOREF(pExtState);
7142 puDst->au64[0] = ~puSrc1->au64[0] & puSrc2->au64[0];
7143 puDst->au64[1] = ~puSrc1->au64[1] & puSrc2->au64[1];
7144 puDst->au64[2] = ~puSrc1->au64[2] & puSrc2->au64[2];
7145 puDst->au64[3] = ~puSrc1->au64[3] & puSrc2->au64[3];
7146}
7147
7148
7149/*
7150 * POR / VPOR / PORPS / VPORPS / PORPD / VPORPD
7151 */
7152#ifdef IEM_WITHOUT_ASSEMBLY
7153
7154IEM_DECL_IMPL_DEF(void, iemAImpl_por_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7155{
7156 RT_NOREF(pFpuState);
7157 *puDst |= *puSrc;
7158}
7159
7160
7161IEM_DECL_IMPL_DEF(void, iemAImpl_por_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7162{
7163 RT_NOREF(pFpuState);
7164 puDst->au64[0] |= puSrc->au64[0];
7165 puDst->au64[1] |= puSrc->au64[1];
7166}
7167
7168#endif
7169
7170IEM_DECL_IMPL_DEF(void, iemAImpl_vpor_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7171 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7172{
7173 RT_NOREF(pExtState);
7174 puDst->au64[0] = puSrc1->au64[0] | puSrc2->au64[0];
7175 puDst->au64[1] = puSrc1->au64[1] | puSrc2->au64[1];
7176}
7177
7178
7179IEM_DECL_IMPL_DEF(void, iemAImpl_vpor_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7180 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7181{
7182 RT_NOREF(pExtState);
7183 puDst->au64[0] = puSrc1->au64[0] | puSrc2->au64[0];
7184 puDst->au64[1] = puSrc1->au64[1] | puSrc2->au64[1];
7185 puDst->au64[2] = puSrc1->au64[2] | puSrc2->au64[2];
7186 puDst->au64[3] = puSrc1->au64[3] | puSrc2->au64[3];
7187}
7188
7189
7190/*
7191 * PXOR / VPXOR / PXORPS / VPXORPS / PXORPD / VPXORPD
7192 */
7193#ifdef IEM_WITHOUT_ASSEMBLY
7194
7195IEM_DECL_IMPL_DEF(void, iemAImpl_pxor_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7196{
7197 RT_NOREF(pFpuState);
7198 *puDst ^= *puSrc;
7199}
7200
7201
7202IEM_DECL_IMPL_DEF(void, iemAImpl_pxor_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7203{
7204 RT_NOREF(pFpuState);
7205 puDst->au64[0] ^= puSrc->au64[0];
7206 puDst->au64[1] ^= puSrc->au64[1];
7207}
7208
7209#endif
7210
7211IEM_DECL_IMPL_DEF(void, iemAImpl_vpxor_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7212 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7213{
7214 RT_NOREF(pExtState);
7215 puDst->au64[0] = puSrc1->au64[0] ^ puSrc2->au64[0];
7216 puDst->au64[1] = puSrc1->au64[1] ^ puSrc2->au64[1];
7217}
7218
7219
7220IEM_DECL_IMPL_DEF(void, iemAImpl_vpxor_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7221 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7222{
7223 RT_NOREF(pExtState);
7224 puDst->au64[0] = puSrc1->au64[0] ^ puSrc2->au64[0];
7225 puDst->au64[1] = puSrc1->au64[1] ^ puSrc2->au64[1];
7226 puDst->au64[2] = puSrc1->au64[2] ^ puSrc2->au64[2];
7227 puDst->au64[3] = puSrc1->au64[3] ^ puSrc2->au64[3];
7228}
7229
7230
7231/*
7232 * PCMPEQB / VPCMPEQB
7233 */
7234#ifdef IEM_WITHOUT_ASSEMBLY
7235
7236IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7237{
7238 RT_NOREF(pFpuState);
7239 RTUINT64U uSrc1 = { *puDst };
7240 RTUINT64U uSrc2 = { *puSrc };
7241 RTUINT64U uDst;
7242 uDst.au8[0] = uSrc1.au8[0] == uSrc2.au8[0] ? 0xff : 0;
7243 uDst.au8[1] = uSrc1.au8[1] == uSrc2.au8[1] ? 0xff : 0;
7244 uDst.au8[2] = uSrc1.au8[2] == uSrc2.au8[2] ? 0xff : 0;
7245 uDst.au8[3] = uSrc1.au8[3] == uSrc2.au8[3] ? 0xff : 0;
7246 uDst.au8[4] = uSrc1.au8[4] == uSrc2.au8[4] ? 0xff : 0;
7247 uDst.au8[5] = uSrc1.au8[5] == uSrc2.au8[5] ? 0xff : 0;
7248 uDst.au8[6] = uSrc1.au8[6] == uSrc2.au8[6] ? 0xff : 0;
7249 uDst.au8[7] = uSrc1.au8[7] == uSrc2.au8[7] ? 0xff : 0;
7250 *puDst = uDst.u;
7251}
7252
7253
7254IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7255{
7256 RT_NOREF(pFpuState);
7257 RTUINT128U uSrc1 = *puDst;
7258 puDst->au8[0] = uSrc1.au8[0] == puSrc->au8[0] ? UINT8_MAX : 0;
7259 puDst->au8[1] = uSrc1.au8[1] == puSrc->au8[1] ? UINT8_MAX : 0;
7260 puDst->au8[2] = uSrc1.au8[2] == puSrc->au8[2] ? UINT8_MAX : 0;
7261 puDst->au8[3] = uSrc1.au8[3] == puSrc->au8[3] ? UINT8_MAX : 0;
7262 puDst->au8[4] = uSrc1.au8[4] == puSrc->au8[4] ? UINT8_MAX : 0;
7263 puDst->au8[5] = uSrc1.au8[5] == puSrc->au8[5] ? UINT8_MAX : 0;
7264 puDst->au8[6] = uSrc1.au8[6] == puSrc->au8[6] ? UINT8_MAX : 0;
7265 puDst->au8[7] = uSrc1.au8[7] == puSrc->au8[7] ? UINT8_MAX : 0;
7266 puDst->au8[8] = uSrc1.au8[8] == puSrc->au8[8] ? UINT8_MAX : 0;
7267 puDst->au8[9] = uSrc1.au8[9] == puSrc->au8[9] ? UINT8_MAX : 0;
7268 puDst->au8[10] = uSrc1.au8[10] == puSrc->au8[10] ? UINT8_MAX : 0;
7269 puDst->au8[11] = uSrc1.au8[11] == puSrc->au8[11] ? UINT8_MAX : 0;
7270 puDst->au8[12] = uSrc1.au8[12] == puSrc->au8[12] ? UINT8_MAX : 0;
7271 puDst->au8[13] = uSrc1.au8[13] == puSrc->au8[13] ? UINT8_MAX : 0;
7272 puDst->au8[14] = uSrc1.au8[14] == puSrc->au8[14] ? UINT8_MAX : 0;
7273 puDst->au8[15] = uSrc1.au8[15] == puSrc->au8[15] ? UINT8_MAX : 0;
7274}
7275
7276#endif
7277
7278IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqb_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7279 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7280{
7281 RT_NOREF(pExtState);
7282 puDst->au8[0] = puSrc1->au8[0] == puSrc2->au8[0] ? UINT8_MAX : 0;
7283 puDst->au8[1] = puSrc1->au8[1] == puSrc2->au8[1] ? UINT8_MAX : 0;
7284 puDst->au8[2] = puSrc1->au8[2] == puSrc2->au8[2] ? UINT8_MAX : 0;
7285 puDst->au8[3] = puSrc1->au8[3] == puSrc2->au8[3] ? UINT8_MAX : 0;
7286 puDst->au8[4] = puSrc1->au8[4] == puSrc2->au8[4] ? UINT8_MAX : 0;
7287 puDst->au8[5] = puSrc1->au8[5] == puSrc2->au8[5] ? UINT8_MAX : 0;
7288 puDst->au8[6] = puSrc1->au8[6] == puSrc2->au8[6] ? UINT8_MAX : 0;
7289 puDst->au8[7] = puSrc1->au8[7] == puSrc2->au8[7] ? UINT8_MAX : 0;
7290 puDst->au8[8] = puSrc1->au8[8] == puSrc2->au8[8] ? UINT8_MAX : 0;
7291 puDst->au8[9] = puSrc1->au8[9] == puSrc2->au8[9] ? UINT8_MAX : 0;
7292 puDst->au8[10] = puSrc1->au8[10] == puSrc2->au8[10] ? UINT8_MAX : 0;
7293 puDst->au8[11] = puSrc1->au8[11] == puSrc2->au8[11] ? UINT8_MAX : 0;
7294 puDst->au8[12] = puSrc1->au8[12] == puSrc2->au8[12] ? UINT8_MAX : 0;
7295 puDst->au8[13] = puSrc1->au8[13] == puSrc2->au8[13] ? UINT8_MAX : 0;
7296 puDst->au8[14] = puSrc1->au8[14] == puSrc2->au8[14] ? UINT8_MAX : 0;
7297 puDst->au8[15] = puSrc1->au8[15] == puSrc2->au8[15] ? UINT8_MAX : 0;
7298}
7299
7300IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqb_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7301 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7302{
7303 RT_NOREF(pExtState);
7304 puDst->au8[0] = puSrc1->au8[0] == puSrc2->au8[0] ? UINT8_MAX : 0;
7305 puDst->au8[1] = puSrc1->au8[1] == puSrc2->au8[1] ? UINT8_MAX : 0;
7306 puDst->au8[2] = puSrc1->au8[2] == puSrc2->au8[2] ? UINT8_MAX : 0;
7307 puDst->au8[3] = puSrc1->au8[3] == puSrc2->au8[3] ? UINT8_MAX : 0;
7308 puDst->au8[4] = puSrc1->au8[4] == puSrc2->au8[4] ? UINT8_MAX : 0;
7309 puDst->au8[5] = puSrc1->au8[5] == puSrc2->au8[5] ? UINT8_MAX : 0;
7310 puDst->au8[6] = puSrc1->au8[6] == puSrc2->au8[6] ? UINT8_MAX : 0;
7311 puDst->au8[7] = puSrc1->au8[7] == puSrc2->au8[7] ? UINT8_MAX : 0;
7312 puDst->au8[8] = puSrc1->au8[8] == puSrc2->au8[8] ? UINT8_MAX : 0;
7313 puDst->au8[9] = puSrc1->au8[9] == puSrc2->au8[9] ? UINT8_MAX : 0;
7314 puDst->au8[10] = puSrc1->au8[10] == puSrc2->au8[10] ? UINT8_MAX : 0;
7315 puDst->au8[11] = puSrc1->au8[11] == puSrc2->au8[11] ? UINT8_MAX : 0;
7316 puDst->au8[12] = puSrc1->au8[12] == puSrc2->au8[12] ? UINT8_MAX : 0;
7317 puDst->au8[13] = puSrc1->au8[13] == puSrc2->au8[13] ? UINT8_MAX : 0;
7318 puDst->au8[14] = puSrc1->au8[14] == puSrc2->au8[14] ? UINT8_MAX : 0;
7319 puDst->au8[15] = puSrc1->au8[15] == puSrc2->au8[15] ? UINT8_MAX : 0;
7320 puDst->au8[16] = puSrc1->au8[16] == puSrc2->au8[16] ? UINT8_MAX : 0;
7321 puDst->au8[17] = puSrc1->au8[17] == puSrc2->au8[17] ? UINT8_MAX : 0;
7322 puDst->au8[18] = puSrc1->au8[18] == puSrc2->au8[18] ? UINT8_MAX : 0;
7323 puDst->au8[19] = puSrc1->au8[19] == puSrc2->au8[19] ? UINT8_MAX : 0;
7324 puDst->au8[20] = puSrc1->au8[20] == puSrc2->au8[20] ? UINT8_MAX : 0;
7325 puDst->au8[21] = puSrc1->au8[21] == puSrc2->au8[21] ? UINT8_MAX : 0;
7326 puDst->au8[22] = puSrc1->au8[22] == puSrc2->au8[22] ? UINT8_MAX : 0;
7327 puDst->au8[23] = puSrc1->au8[23] == puSrc2->au8[23] ? UINT8_MAX : 0;
7328 puDst->au8[24] = puSrc1->au8[24] == puSrc2->au8[24] ? UINT8_MAX : 0;
7329 puDst->au8[25] = puSrc1->au8[25] == puSrc2->au8[25] ? UINT8_MAX : 0;
7330 puDst->au8[26] = puSrc1->au8[26] == puSrc2->au8[26] ? UINT8_MAX : 0;
7331 puDst->au8[27] = puSrc1->au8[27] == puSrc2->au8[27] ? UINT8_MAX : 0;
7332 puDst->au8[28] = puSrc1->au8[28] == puSrc2->au8[28] ? UINT8_MAX : 0;
7333 puDst->au8[29] = puSrc1->au8[29] == puSrc2->au8[29] ? UINT8_MAX : 0;
7334 puDst->au8[30] = puSrc1->au8[30] == puSrc2->au8[30] ? UINT8_MAX : 0;
7335 puDst->au8[31] = puSrc1->au8[31] == puSrc2->au8[31] ? UINT8_MAX : 0;
7336}
7337
7338
7339/*
7340 * PCMPEQW / VPCMPEQW
7341 */
7342#ifdef IEM_WITHOUT_ASSEMBLY
7343
7344IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7345{
7346 RT_NOREF(pFpuState);
7347 RTUINT64U uSrc1 = { *puDst };
7348 RTUINT64U uSrc2 = { *puSrc };
7349 RTUINT64U uDst;
7350 uDst.au16[0] = uSrc1.au16[0] == uSrc2.au16[0] ? UINT16_MAX : 0;
7351 uDst.au16[1] = uSrc1.au16[1] == uSrc2.au16[1] ? UINT16_MAX : 0;
7352 uDst.au16[2] = uSrc1.au16[2] == uSrc2.au16[2] ? UINT16_MAX : 0;
7353 uDst.au16[3] = uSrc1.au16[3] == uSrc2.au16[3] ? UINT16_MAX : 0;
7354 *puDst = uDst.u;
7355}
7356
7357
7358IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7359{
7360 RT_NOREF(pFpuState);
7361 RTUINT128U uSrc1 = *puDst;
7362 puDst->au16[0] = uSrc1.au16[0] == puSrc->au16[0] ? UINT16_MAX : 0;
7363 puDst->au16[1] = uSrc1.au16[1] == puSrc->au16[1] ? UINT16_MAX : 0;
7364 puDst->au16[2] = uSrc1.au16[2] == puSrc->au16[2] ? UINT16_MAX : 0;
7365 puDst->au16[3] = uSrc1.au16[3] == puSrc->au16[3] ? UINT16_MAX : 0;
7366 puDst->au16[4] = uSrc1.au16[4] == puSrc->au16[4] ? UINT16_MAX : 0;
7367 puDst->au16[5] = uSrc1.au16[5] == puSrc->au16[5] ? UINT16_MAX : 0;
7368 puDst->au16[6] = uSrc1.au16[6] == puSrc->au16[6] ? UINT16_MAX : 0;
7369 puDst->au16[7] = uSrc1.au16[7] == puSrc->au16[7] ? UINT16_MAX : 0;
7370}
7371
7372#endif
7373
7374IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7375 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7376{
7377 RT_NOREF(pExtState);
7378 puDst->au16[0] = puSrc1->au16[0] == puSrc2->au16[0] ? UINT16_MAX : 0;
7379 puDst->au16[1] = puSrc1->au16[1] == puSrc2->au16[1] ? UINT16_MAX : 0;
7380 puDst->au16[2] = puSrc1->au16[2] == puSrc2->au16[2] ? UINT16_MAX : 0;
7381 puDst->au16[3] = puSrc1->au16[3] == puSrc2->au16[3] ? UINT16_MAX : 0;
7382 puDst->au16[4] = puSrc1->au16[4] == puSrc2->au16[4] ? UINT16_MAX : 0;
7383 puDst->au16[5] = puSrc1->au16[5] == puSrc2->au16[5] ? UINT16_MAX : 0;
7384 puDst->au16[6] = puSrc1->au16[6] == puSrc2->au16[6] ? UINT16_MAX : 0;
7385 puDst->au16[7] = puSrc1->au16[7] == puSrc2->au16[7] ? UINT16_MAX : 0;
7386}
7387
7388IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7389 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7390{
7391 RT_NOREF(pExtState);
7392 puDst->au16[0] = puSrc1->au16[0] == puSrc2->au16[0] ? UINT16_MAX : 0;
7393 puDst->au16[1] = puSrc1->au16[1] == puSrc2->au16[1] ? UINT16_MAX : 0;
7394 puDst->au16[2] = puSrc1->au16[2] == puSrc2->au16[2] ? UINT16_MAX : 0;
7395 puDst->au16[3] = puSrc1->au16[3] == puSrc2->au16[3] ? UINT16_MAX : 0;
7396 puDst->au16[4] = puSrc1->au16[4] == puSrc2->au16[4] ? UINT16_MAX : 0;
7397 puDst->au16[5] = puSrc1->au16[5] == puSrc2->au16[5] ? UINT16_MAX : 0;
7398 puDst->au16[6] = puSrc1->au16[6] == puSrc2->au16[6] ? UINT16_MAX : 0;
7399 puDst->au16[7] = puSrc1->au16[7] == puSrc2->au16[7] ? UINT16_MAX : 0;
7400 puDst->au16[8] = puSrc1->au16[8] == puSrc2->au16[8] ? UINT16_MAX : 0;
7401 puDst->au16[9] = puSrc1->au16[9] == puSrc2->au16[9] ? UINT16_MAX : 0;
7402 puDst->au16[10] = puSrc1->au16[10] == puSrc2->au16[10] ? UINT16_MAX : 0;
7403 puDst->au16[11] = puSrc1->au16[11] == puSrc2->au16[11] ? UINT16_MAX : 0;
7404 puDst->au16[12] = puSrc1->au16[12] == puSrc2->au16[12] ? UINT16_MAX : 0;
7405 puDst->au16[13] = puSrc1->au16[13] == puSrc2->au16[13] ? UINT16_MAX : 0;
7406 puDst->au16[14] = puSrc1->au16[14] == puSrc2->au16[14] ? UINT16_MAX : 0;
7407 puDst->au16[15] = puSrc1->au16[15] == puSrc2->au16[15] ? UINT16_MAX : 0;
7408}
7409
7410
7411/*
7412 * PCMPEQD / VPCMPEQD.
7413 */
7414#ifdef IEM_WITHOUT_ASSEMBLY
7415
7416IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqd_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7417{
7418 RT_NOREF(pFpuState);
7419 RTUINT64U uSrc1 = { *puDst };
7420 RTUINT64U uSrc2 = { *puSrc };
7421 RTUINT64U uDst;
7422 uDst.au32[0] = uSrc1.au32[0] == uSrc2.au32[0] ? UINT32_MAX : 0;
7423 uDst.au32[1] = uSrc1.au32[1] == uSrc2.au32[1] ? UINT32_MAX : 0;
7424 *puDst = uDst.u;
7425}
7426
7427
7428IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqd_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7429{
7430 RT_NOREF(pFpuState);
7431 RTUINT128U uSrc1 = *puDst;
7432 puDst->au32[0] = uSrc1.au32[0] == puSrc->au32[0] ? UINT32_MAX : 0;
7433 puDst->au32[1] = uSrc1.au32[1] == puSrc->au32[1] ? UINT32_MAX : 0;
7434 puDst->au32[2] = uSrc1.au32[2] == puSrc->au32[2] ? UINT32_MAX : 0;
7435 puDst->au32[3] = uSrc1.au32[3] == puSrc->au32[3] ? UINT32_MAX : 0;
7436}
7437
7438#endif /* IEM_WITHOUT_ASSEMBLY */
7439
7440IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqd_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7441 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7442{
7443 RT_NOREF(pExtState);
7444 puDst->au32[0] = puSrc1->au32[0] == puSrc2->au32[0] ? UINT32_MAX : 0;
7445 puDst->au32[1] = puSrc1->au32[1] == puSrc2->au32[1] ? UINT32_MAX : 0;
7446 puDst->au32[2] = puSrc1->au32[2] == puSrc2->au32[2] ? UINT32_MAX : 0;
7447 puDst->au32[3] = puSrc1->au32[3] == puSrc2->au32[3] ? UINT32_MAX : 0;
7448}
7449
7450IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqd_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7451 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7452{
7453 RT_NOREF(pExtState);
7454 puDst->au32[0] = puSrc1->au32[0] == puSrc2->au32[0] ? UINT32_MAX : 0;
7455 puDst->au32[1] = puSrc1->au32[1] == puSrc2->au32[1] ? UINT32_MAX : 0;
7456 puDst->au32[2] = puSrc1->au32[2] == puSrc2->au32[2] ? UINT32_MAX : 0;
7457 puDst->au32[3] = puSrc1->au32[3] == puSrc2->au32[3] ? UINT32_MAX : 0;
7458 puDst->au32[4] = puSrc1->au32[4] == puSrc2->au32[4] ? UINT32_MAX : 0;
7459 puDst->au32[5] = puSrc1->au32[5] == puSrc2->au32[5] ? UINT32_MAX : 0;
7460 puDst->au32[6] = puSrc1->au32[6] == puSrc2->au32[6] ? UINT32_MAX : 0;
7461 puDst->au32[7] = puSrc1->au32[7] == puSrc2->au32[7] ? UINT32_MAX : 0;
7462}
7463
7464
7465/*
7466 * PCMPEQQ / VPCMPEQQ.
7467 */
7468IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqq_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7469{
7470 RT_NOREF(pFpuState);
7471 RTUINT128U uSrc1 = *puDst;
7472 puDst->au64[0] = uSrc1.au64[0] == puSrc->au64[0] ? UINT64_MAX : 0;
7473 puDst->au64[1] = uSrc1.au64[1] == puSrc->au64[1] ? UINT64_MAX : 0;
7474}
7475
7476IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqq_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7477 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7478{
7479 RT_NOREF(pExtState);
7480 puDst->au64[0] = puSrc1->au64[0] == puSrc2->au64[0] ? UINT64_MAX : 0;
7481 puDst->au64[1] = puSrc1->au64[1] == puSrc2->au64[1] ? UINT64_MAX : 0;
7482}
7483
7484IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqq_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7485 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7486{
7487 RT_NOREF(pExtState);
7488 puDst->au64[0] = puSrc1->au64[0] == puSrc2->au64[0] ? UINT64_MAX : 0;
7489 puDst->au64[1] = puSrc1->au64[1] == puSrc2->au64[1] ? UINT64_MAX : 0;
7490 puDst->au64[2] = puSrc1->au64[2] == puSrc2->au64[2] ? UINT64_MAX : 0;
7491 puDst->au64[3] = puSrc1->au64[3] == puSrc2->au64[3] ? UINT64_MAX : 0;
7492}
7493
7494
7495/*
7496 * PCMPGTB / VPCMPGTB
7497 */
7498#ifdef IEM_WITHOUT_ASSEMBLY
7499
7500IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7501{
7502 RT_NOREF(pFpuState);
7503 RTUINT64U uSrc1 = { *puDst };
7504 RTUINT64U uSrc2 = { *puSrc };
7505 RTUINT64U uDst;
7506 uDst.au8[0] = uSrc1.ai8[0] > uSrc2.ai8[0] ? UINT8_MAX : 0;
7507 uDst.au8[1] = uSrc1.ai8[1] > uSrc2.ai8[1] ? UINT8_MAX : 0;
7508 uDst.au8[2] = uSrc1.ai8[2] > uSrc2.ai8[2] ? UINT8_MAX : 0;
7509 uDst.au8[3] = uSrc1.ai8[3] > uSrc2.ai8[3] ? UINT8_MAX : 0;
7510 uDst.au8[4] = uSrc1.ai8[4] > uSrc2.ai8[4] ? UINT8_MAX : 0;
7511 uDst.au8[5] = uSrc1.ai8[5] > uSrc2.ai8[5] ? UINT8_MAX : 0;
7512 uDst.au8[6] = uSrc1.ai8[6] > uSrc2.ai8[6] ? UINT8_MAX : 0;
7513 uDst.au8[7] = uSrc1.ai8[7] > uSrc2.ai8[7] ? UINT8_MAX : 0;
7514 *puDst = uDst.u;
7515}
7516
7517
7518IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7519{
7520 RT_NOREF(pFpuState);
7521 RTUINT128U uSrc1 = *puDst;
7522 puDst->au8[0] = uSrc1.ai8[0] > puSrc->ai8[0] ? UINT8_MAX : 0;
7523 puDst->au8[1] = uSrc1.ai8[1] > puSrc->ai8[1] ? UINT8_MAX : 0;
7524 puDst->au8[2] = uSrc1.ai8[2] > puSrc->ai8[2] ? UINT8_MAX : 0;
7525 puDst->au8[3] = uSrc1.ai8[3] > puSrc->ai8[3] ? UINT8_MAX : 0;
7526 puDst->au8[4] = uSrc1.ai8[4] > puSrc->ai8[4] ? UINT8_MAX : 0;
7527 puDst->au8[5] = uSrc1.ai8[5] > puSrc->ai8[5] ? UINT8_MAX : 0;
7528 puDst->au8[6] = uSrc1.ai8[6] > puSrc->ai8[6] ? UINT8_MAX : 0;
7529 puDst->au8[7] = uSrc1.ai8[7] > puSrc->ai8[7] ? UINT8_MAX : 0;
7530 puDst->au8[8] = uSrc1.ai8[8] > puSrc->ai8[8] ? UINT8_MAX : 0;
7531 puDst->au8[9] = uSrc1.ai8[9] > puSrc->ai8[9] ? UINT8_MAX : 0;
7532 puDst->au8[10] = uSrc1.ai8[10] > puSrc->ai8[10] ? UINT8_MAX : 0;
7533 puDst->au8[11] = uSrc1.ai8[11] > puSrc->ai8[11] ? UINT8_MAX : 0;
7534 puDst->au8[12] = uSrc1.ai8[12] > puSrc->ai8[12] ? UINT8_MAX : 0;
7535 puDst->au8[13] = uSrc1.ai8[13] > puSrc->ai8[13] ? UINT8_MAX : 0;
7536 puDst->au8[14] = uSrc1.ai8[14] > puSrc->ai8[14] ? UINT8_MAX : 0;
7537 puDst->au8[15] = uSrc1.ai8[15] > puSrc->ai8[15] ? UINT8_MAX : 0;
7538}
7539
7540#endif
7541
7542IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtb_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7543 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7544{
7545 RT_NOREF(pExtState);
7546 puDst->au8[0] = puSrc1->ai8[0] > puSrc2->ai8[0] ? UINT8_MAX : 0;
7547 puDst->au8[1] = puSrc1->ai8[1] > puSrc2->ai8[1] ? UINT8_MAX : 0;
7548 puDst->au8[2] = puSrc1->ai8[2] > puSrc2->ai8[2] ? UINT8_MAX : 0;
7549 puDst->au8[3] = puSrc1->ai8[3] > puSrc2->ai8[3] ? UINT8_MAX : 0;
7550 puDst->au8[4] = puSrc1->ai8[4] > puSrc2->ai8[4] ? UINT8_MAX : 0;
7551 puDst->au8[5] = puSrc1->ai8[5] > puSrc2->ai8[5] ? UINT8_MAX : 0;
7552 puDst->au8[6] = puSrc1->ai8[6] > puSrc2->ai8[6] ? UINT8_MAX : 0;
7553 puDst->au8[7] = puSrc1->ai8[7] > puSrc2->ai8[7] ? UINT8_MAX : 0;
7554 puDst->au8[8] = puSrc1->ai8[8] > puSrc2->ai8[8] ? UINT8_MAX : 0;
7555 puDst->au8[9] = puSrc1->ai8[9] > puSrc2->ai8[9] ? UINT8_MAX : 0;
7556 puDst->au8[10] = puSrc1->ai8[10] > puSrc2->ai8[10] ? UINT8_MAX : 0;
7557 puDst->au8[11] = puSrc1->ai8[11] > puSrc2->ai8[11] ? UINT8_MAX : 0;
7558 puDst->au8[12] = puSrc1->ai8[12] > puSrc2->ai8[12] ? UINT8_MAX : 0;
7559 puDst->au8[13] = puSrc1->ai8[13] > puSrc2->ai8[13] ? UINT8_MAX : 0;
7560 puDst->au8[14] = puSrc1->ai8[14] > puSrc2->ai8[14] ? UINT8_MAX : 0;
7561 puDst->au8[15] = puSrc1->ai8[15] > puSrc2->ai8[15] ? UINT8_MAX : 0;
7562}
7563
7564IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtb_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7565 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7566{
7567 RT_NOREF(pExtState);
7568 puDst->au8[0] = puSrc1->ai8[0] > puSrc2->ai8[0] ? UINT8_MAX : 0;
7569 puDst->au8[1] = puSrc1->ai8[1] > puSrc2->ai8[1] ? UINT8_MAX : 0;
7570 puDst->au8[2] = puSrc1->ai8[2] > puSrc2->ai8[2] ? UINT8_MAX : 0;
7571 puDst->au8[3] = puSrc1->ai8[3] > puSrc2->ai8[3] ? UINT8_MAX : 0;
7572 puDst->au8[4] = puSrc1->ai8[4] > puSrc2->ai8[4] ? UINT8_MAX : 0;
7573 puDst->au8[5] = puSrc1->ai8[5] > puSrc2->ai8[5] ? UINT8_MAX : 0;
7574 puDst->au8[6] = puSrc1->ai8[6] > puSrc2->ai8[6] ? UINT8_MAX : 0;
7575 puDst->au8[7] = puSrc1->ai8[7] > puSrc2->ai8[7] ? UINT8_MAX : 0;
7576 puDst->au8[8] = puSrc1->ai8[8] > puSrc2->ai8[8] ? UINT8_MAX : 0;
7577 puDst->au8[9] = puSrc1->ai8[9] > puSrc2->ai8[9] ? UINT8_MAX : 0;
7578 puDst->au8[10] = puSrc1->ai8[10] > puSrc2->ai8[10] ? UINT8_MAX : 0;
7579 puDst->au8[11] = puSrc1->ai8[11] > puSrc2->ai8[11] ? UINT8_MAX : 0;
7580 puDst->au8[12] = puSrc1->ai8[12] > puSrc2->ai8[12] ? UINT8_MAX : 0;
7581 puDst->au8[13] = puSrc1->ai8[13] > puSrc2->ai8[13] ? UINT8_MAX : 0;
7582 puDst->au8[14] = puSrc1->ai8[14] > puSrc2->ai8[14] ? UINT8_MAX : 0;
7583 puDst->au8[15] = puSrc1->ai8[15] > puSrc2->ai8[15] ? UINT8_MAX : 0;
7584 puDst->au8[16] = puSrc1->ai8[16] > puSrc2->ai8[16] ? UINT8_MAX : 0;
7585 puDst->au8[17] = puSrc1->ai8[17] > puSrc2->ai8[17] ? UINT8_MAX : 0;
7586 puDst->au8[18] = puSrc1->ai8[18] > puSrc2->ai8[18] ? UINT8_MAX : 0;
7587 puDst->au8[19] = puSrc1->ai8[19] > puSrc2->ai8[19] ? UINT8_MAX : 0;
7588 puDst->au8[20] = puSrc1->ai8[20] > puSrc2->ai8[20] ? UINT8_MAX : 0;
7589 puDst->au8[21] = puSrc1->ai8[21] > puSrc2->ai8[21] ? UINT8_MAX : 0;
7590 puDst->au8[22] = puSrc1->ai8[22] > puSrc2->ai8[22] ? UINT8_MAX : 0;
7591 puDst->au8[23] = puSrc1->ai8[23] > puSrc2->ai8[23] ? UINT8_MAX : 0;
7592 puDst->au8[24] = puSrc1->ai8[24] > puSrc2->ai8[24] ? UINT8_MAX : 0;
7593 puDst->au8[25] = puSrc1->ai8[25] > puSrc2->ai8[25] ? UINT8_MAX : 0;
7594 puDst->au8[26] = puSrc1->ai8[26] > puSrc2->ai8[26] ? UINT8_MAX : 0;
7595 puDst->au8[27] = puSrc1->ai8[27] > puSrc2->ai8[27] ? UINT8_MAX : 0;
7596 puDst->au8[28] = puSrc1->ai8[28] > puSrc2->ai8[28] ? UINT8_MAX : 0;
7597 puDst->au8[29] = puSrc1->ai8[29] > puSrc2->ai8[29] ? UINT8_MAX : 0;
7598 puDst->au8[30] = puSrc1->ai8[30] > puSrc2->ai8[30] ? UINT8_MAX : 0;
7599 puDst->au8[31] = puSrc1->ai8[31] > puSrc2->ai8[31] ? UINT8_MAX : 0;
7600}
7601
7602
7603/*
7604 * PCMPGTW / VPCMPGTW
7605 */
7606#ifdef IEM_WITHOUT_ASSEMBLY
7607
7608IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7609{
7610 RT_NOREF(pFpuState);
7611 RTUINT64U uSrc1 = { *puDst };
7612 RTUINT64U uSrc2 = { *puSrc };
7613 RTUINT64U uDst;
7614 uDst.au16[0] = uSrc1.ai16[0] > uSrc2.ai16[0] ? UINT16_MAX : 0;
7615 uDst.au16[1] = uSrc1.ai16[1] > uSrc2.ai16[1] ? UINT16_MAX : 0;
7616 uDst.au16[2] = uSrc1.ai16[2] > uSrc2.ai16[2] ? UINT16_MAX : 0;
7617 uDst.au16[3] = uSrc1.ai16[3] > uSrc2.ai16[3] ? UINT16_MAX : 0;
7618 *puDst = uDst.u;
7619}
7620
7621
7622IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7623{
7624 RT_NOREF(pFpuState);
7625 RTUINT128U uSrc1 = *puDst;
7626 puDst->au16[0] = uSrc1.ai16[0] > puSrc->ai16[0] ? UINT16_MAX : 0;
7627 puDst->au16[1] = uSrc1.ai16[1] > puSrc->ai16[1] ? UINT16_MAX : 0;
7628 puDst->au16[2] = uSrc1.ai16[2] > puSrc->ai16[2] ? UINT16_MAX : 0;
7629 puDst->au16[3] = uSrc1.ai16[3] > puSrc->ai16[3] ? UINT16_MAX : 0;
7630 puDst->au16[4] = uSrc1.ai16[4] > puSrc->ai16[4] ? UINT16_MAX : 0;
7631 puDst->au16[5] = uSrc1.ai16[5] > puSrc->ai16[5] ? UINT16_MAX : 0;
7632 puDst->au16[6] = uSrc1.ai16[6] > puSrc->ai16[6] ? UINT16_MAX : 0;
7633 puDst->au16[7] = uSrc1.ai16[7] > puSrc->ai16[7] ? UINT16_MAX : 0;
7634}
7635
7636#endif
7637
7638IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7639 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7640{
7641 RT_NOREF(pExtState);
7642 puDst->au16[0] = puSrc1->ai16[0] > puSrc2->ai16[0] ? UINT16_MAX : 0;
7643 puDst->au16[1] = puSrc1->ai16[1] > puSrc2->ai16[1] ? UINT16_MAX : 0;
7644 puDst->au16[2] = puSrc1->ai16[2] > puSrc2->ai16[2] ? UINT16_MAX : 0;
7645 puDst->au16[3] = puSrc1->ai16[3] > puSrc2->ai16[3] ? UINT16_MAX : 0;
7646 puDst->au16[4] = puSrc1->ai16[4] > puSrc2->ai16[4] ? UINT16_MAX : 0;
7647 puDst->au16[5] = puSrc1->ai16[5] > puSrc2->ai16[5] ? UINT16_MAX : 0;
7648 puDst->au16[6] = puSrc1->ai16[6] > puSrc2->ai16[6] ? UINT16_MAX : 0;
7649 puDst->au16[7] = puSrc1->ai16[7] > puSrc2->ai16[7] ? UINT16_MAX : 0;
7650}
7651
7652IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7653 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7654{
7655 RT_NOREF(pExtState);
7656 puDst->au16[0] = puSrc1->ai16[0] > puSrc2->ai16[0] ? UINT16_MAX : 0;
7657 puDst->au16[1] = puSrc1->ai16[1] > puSrc2->ai16[1] ? UINT16_MAX : 0;
7658 puDst->au16[2] = puSrc1->ai16[2] > puSrc2->ai16[2] ? UINT16_MAX : 0;
7659 puDst->au16[3] = puSrc1->ai16[3] > puSrc2->ai16[3] ? UINT16_MAX : 0;
7660 puDst->au16[4] = puSrc1->ai16[4] > puSrc2->ai16[4] ? UINT16_MAX : 0;
7661 puDst->au16[5] = puSrc1->ai16[5] > puSrc2->ai16[5] ? UINT16_MAX : 0;
7662 puDst->au16[6] = puSrc1->ai16[6] > puSrc2->ai16[6] ? UINT16_MAX : 0;
7663 puDst->au16[7] = puSrc1->ai16[7] > puSrc2->ai16[7] ? UINT16_MAX : 0;
7664 puDst->au16[8] = puSrc1->ai16[8] > puSrc2->ai16[8] ? UINT16_MAX : 0;
7665 puDst->au16[9] = puSrc1->ai16[9] > puSrc2->ai16[9] ? UINT16_MAX : 0;
7666 puDst->au16[10] = puSrc1->ai16[10] > puSrc2->ai16[10] ? UINT16_MAX : 0;
7667 puDst->au16[11] = puSrc1->ai16[11] > puSrc2->ai16[11] ? UINT16_MAX : 0;
7668 puDst->au16[12] = puSrc1->ai16[12] > puSrc2->ai16[12] ? UINT16_MAX : 0;
7669 puDst->au16[13] = puSrc1->ai16[13] > puSrc2->ai16[13] ? UINT16_MAX : 0;
7670 puDst->au16[14] = puSrc1->ai16[14] > puSrc2->ai16[14] ? UINT16_MAX : 0;
7671 puDst->au16[15] = puSrc1->ai16[15] > puSrc2->ai16[15] ? UINT16_MAX : 0;
7672}
7673
7674
7675/*
7676 * PCMPGTD / VPCMPGTD.
7677 */
7678#ifdef IEM_WITHOUT_ASSEMBLY
7679
7680IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtd_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7681{
7682 RT_NOREF(pFpuState);
7683 RTUINT64U uSrc1 = { *puDst };
7684 RTUINT64U uSrc2 = { *puSrc };
7685 RTUINT64U uDst;
7686 uDst.au32[0] = uSrc1.ai32[0] > uSrc2.ai32[0] ? UINT32_MAX : 0;
7687 uDst.au32[1] = uSrc1.ai32[1] > uSrc2.ai32[1] ? UINT32_MAX : 0;
7688 *puDst = uDst.u;
7689}
7690
7691
7692IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtd_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7693{
7694 RT_NOREF(pFpuState);
7695 RTUINT128U uSrc1 = *puDst;
7696 puDst->au32[0] = uSrc1.ai32[0] > puSrc->ai32[0] ? UINT32_MAX : 0;
7697 puDst->au32[1] = uSrc1.ai32[1] > puSrc->ai32[1] ? UINT32_MAX : 0;
7698 puDst->au32[2] = uSrc1.ai32[2] > puSrc->ai32[2] ? UINT32_MAX : 0;
7699 puDst->au32[3] = uSrc1.ai32[3] > puSrc->ai32[3] ? UINT32_MAX : 0;
7700}
7701
7702#endif /* IEM_WITHOUT_ASSEMBLY */
7703
7704IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtd_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7705 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7706{
7707 RT_NOREF(pExtState);
7708 puDst->au32[0] = puSrc1->ai32[0] > puSrc2->ai32[0] ? UINT32_MAX : 0;
7709 puDst->au32[1] = puSrc1->ai32[1] > puSrc2->ai32[1] ? UINT32_MAX : 0;
7710 puDst->au32[2] = puSrc1->ai32[2] > puSrc2->ai32[2] ? UINT32_MAX : 0;
7711 puDst->au32[3] = puSrc1->ai32[3] > puSrc2->ai32[3] ? UINT32_MAX : 0;
7712}
7713
7714IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtd_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7715 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7716{
7717 RT_NOREF(pExtState);
7718 puDst->au32[0] = puSrc1->ai32[0] > puSrc2->ai32[0] ? UINT32_MAX : 0;
7719 puDst->au32[1] = puSrc1->ai32[1] > puSrc2->ai32[1] ? UINT32_MAX : 0;
7720 puDst->au32[2] = puSrc1->ai32[2] > puSrc2->ai32[2] ? UINT32_MAX : 0;
7721 puDst->au32[3] = puSrc1->ai32[3] > puSrc2->ai32[3] ? UINT32_MAX : 0;
7722 puDst->au32[4] = puSrc1->ai32[4] > puSrc2->ai32[4] ? UINT32_MAX : 0;
7723 puDst->au32[5] = puSrc1->ai32[5] > puSrc2->ai32[5] ? UINT32_MAX : 0;
7724 puDst->au32[6] = puSrc1->ai32[6] > puSrc2->ai32[6] ? UINT32_MAX : 0;
7725 puDst->au32[7] = puSrc1->ai32[7] > puSrc2->ai32[7] ? UINT32_MAX : 0;
7726}
7727
7728
7729/*
7730 * PCMPGTQ / VPCMPGTQ.
7731 */
7732IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtq_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7733{
7734 RT_NOREF(pFpuState);
7735 RTUINT128U uSrc1 = *puDst;
7736 puDst->au64[0] = uSrc1.ai64[0] > puSrc->ai64[0] ? UINT64_MAX : 0;
7737 puDst->au64[1] = uSrc1.ai64[1] > puSrc->ai64[1] ? UINT64_MAX : 0;
7738}
7739
7740IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtq_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7741 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7742{
7743 RT_NOREF(pExtState);
7744 puDst->au64[0] = puSrc1->ai64[0] > puSrc2->ai64[0] ? UINT64_MAX : 0;
7745 puDst->au64[1] = puSrc1->ai64[1] > puSrc2->ai64[1] ? UINT64_MAX : 0;
7746}
7747
7748IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtq_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7749 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7750{
7751 RT_NOREF(pExtState);
7752 puDst->au64[0] = puSrc1->ai64[0] > puSrc2->ai64[0] ? UINT64_MAX : 0;
7753 puDst->au64[1] = puSrc1->ai64[1] > puSrc2->ai64[1] ? UINT64_MAX : 0;
7754 puDst->au64[2] = puSrc1->ai64[2] > puSrc2->ai64[2] ? UINT64_MAX : 0;
7755 puDst->au64[3] = puSrc1->ai64[3] > puSrc2->ai64[3] ? UINT64_MAX : 0;
7756}
7757
7758
7759/*
7760 * PADDB / VPADDB
7761 */
7762#ifdef IEM_WITHOUT_ASSEMBLY
7763
7764IEM_DECL_IMPL_DEF(void, iemAImpl_paddb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7765{
7766 RT_NOREF(pFpuState);
7767 RTUINT64U uSrc1 = { *puDst };
7768 RTUINT64U uSrc2 = { *puSrc };
7769 RTUINT64U uDst;
7770 uDst.au8[0] = uSrc1.au8[0] + uSrc2.au8[0];
7771 uDst.au8[1] = uSrc1.au8[1] + uSrc2.au8[1];
7772 uDst.au8[2] = uSrc1.au8[2] + uSrc2.au8[2];
7773 uDst.au8[3] = uSrc1.au8[3] + uSrc2.au8[3];
7774 uDst.au8[4] = uSrc1.au8[4] + uSrc2.au8[4];
7775 uDst.au8[5] = uSrc1.au8[5] + uSrc2.au8[5];
7776 uDst.au8[6] = uSrc1.au8[6] + uSrc2.au8[6];
7777 uDst.au8[7] = uSrc1.au8[7] + uSrc2.au8[7];
7778 *puDst = uDst.u;
7779}
7780
7781
7782IEM_DECL_IMPL_DEF(void, iemAImpl_paddb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7783{
7784 RT_NOREF(pFpuState);
7785 RTUINT128U uSrc1 = *puDst;
7786 puDst->au8[0] = uSrc1.au8[0] + puSrc->au8[0];
7787 puDst->au8[1] = uSrc1.au8[1] + puSrc->au8[1];
7788 puDst->au8[2] = uSrc1.au8[2] + puSrc->au8[2];
7789 puDst->au8[3] = uSrc1.au8[3] + puSrc->au8[3];
7790 puDst->au8[4] = uSrc1.au8[4] + puSrc->au8[4];
7791 puDst->au8[5] = uSrc1.au8[5] + puSrc->au8[5];
7792 puDst->au8[6] = uSrc1.au8[6] + puSrc->au8[6];
7793 puDst->au8[7] = uSrc1.au8[7] + puSrc->au8[7];
7794 puDst->au8[8] = uSrc1.au8[8] + puSrc->au8[8];
7795 puDst->au8[9] = uSrc1.au8[9] + puSrc->au8[9];
7796 puDst->au8[10] = uSrc1.au8[10] + puSrc->au8[10];
7797 puDst->au8[11] = uSrc1.au8[11] + puSrc->au8[11];
7798 puDst->au8[12] = uSrc1.au8[12] + puSrc->au8[12];
7799 puDst->au8[13] = uSrc1.au8[13] + puSrc->au8[13];
7800 puDst->au8[14] = uSrc1.au8[14] + puSrc->au8[14];
7801 puDst->au8[15] = uSrc1.au8[15] + puSrc->au8[15];
7802}
7803
7804#endif
7805
7806
7807IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddb_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7808 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7809{
7810 RT_NOREF(pExtState);
7811 puDst->au8[0] = puSrc1->au8[0] + puSrc2->au8[0];
7812 puDst->au8[1] = puSrc1->au8[1] + puSrc2->au8[1];
7813 puDst->au8[2] = puSrc1->au8[2] + puSrc2->au8[2];
7814 puDst->au8[3] = puSrc1->au8[3] + puSrc2->au8[3];
7815 puDst->au8[4] = puSrc1->au8[4] + puSrc2->au8[4];
7816 puDst->au8[5] = puSrc1->au8[5] + puSrc2->au8[5];
7817 puDst->au8[6] = puSrc1->au8[6] + puSrc2->au8[6];
7818 puDst->au8[7] = puSrc1->au8[7] + puSrc2->au8[7];
7819 puDst->au8[8] = puSrc1->au8[8] + puSrc2->au8[8];
7820 puDst->au8[9] = puSrc1->au8[9] + puSrc2->au8[9];
7821 puDst->au8[10] = puSrc1->au8[10] + puSrc2->au8[10];
7822 puDst->au8[11] = puSrc1->au8[11] + puSrc2->au8[11];
7823 puDst->au8[12] = puSrc1->au8[12] + puSrc2->au8[12];
7824 puDst->au8[13] = puSrc1->au8[13] + puSrc2->au8[13];
7825 puDst->au8[14] = puSrc1->au8[14] + puSrc2->au8[14];
7826 puDst->au8[15] = puSrc1->au8[15] + puSrc2->au8[15];
7827}
7828
7829IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddb_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7830 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7831{
7832 RT_NOREF(pExtState);
7833 puDst->au8[0] = puSrc1->au8[0] + puSrc2->au8[0];
7834 puDst->au8[1] = puSrc1->au8[1] + puSrc2->au8[1];
7835 puDst->au8[2] = puSrc1->au8[2] + puSrc2->au8[2];
7836 puDst->au8[3] = puSrc1->au8[3] + puSrc2->au8[3];
7837 puDst->au8[4] = puSrc1->au8[4] + puSrc2->au8[4];
7838 puDst->au8[5] = puSrc1->au8[5] + puSrc2->au8[5];
7839 puDst->au8[6] = puSrc1->au8[6] + puSrc2->au8[6];
7840 puDst->au8[7] = puSrc1->au8[7] + puSrc2->au8[7];
7841 puDst->au8[8] = puSrc1->au8[8] + puSrc2->au8[8];
7842 puDst->au8[9] = puSrc1->au8[9] + puSrc2->au8[9];
7843 puDst->au8[10] = puSrc1->au8[10] + puSrc2->au8[10];
7844 puDst->au8[11] = puSrc1->au8[11] + puSrc2->au8[11];
7845 puDst->au8[12] = puSrc1->au8[12] + puSrc2->au8[12];
7846 puDst->au8[13] = puSrc1->au8[13] + puSrc2->au8[13];
7847 puDst->au8[14] = puSrc1->au8[14] + puSrc2->au8[14];
7848 puDst->au8[15] = puSrc1->au8[15] + puSrc2->au8[15];
7849 puDst->au8[16] = puSrc1->au8[16] + puSrc2->au8[16];
7850 puDst->au8[17] = puSrc1->au8[17] + puSrc2->au8[17];
7851 puDst->au8[18] = puSrc1->au8[18] + puSrc2->au8[18];
7852 puDst->au8[19] = puSrc1->au8[19] + puSrc2->au8[19];
7853 puDst->au8[20] = puSrc1->au8[20] + puSrc2->au8[20];
7854 puDst->au8[21] = puSrc1->au8[21] + puSrc2->au8[21];
7855 puDst->au8[22] = puSrc1->au8[22] + puSrc2->au8[22];
7856 puDst->au8[23] = puSrc1->au8[23] + puSrc2->au8[23];
7857 puDst->au8[24] = puSrc1->au8[24] + puSrc2->au8[24];
7858 puDst->au8[25] = puSrc1->au8[25] + puSrc2->au8[25];
7859 puDst->au8[26] = puSrc1->au8[26] + puSrc2->au8[26];
7860 puDst->au8[27] = puSrc1->au8[27] + puSrc2->au8[27];
7861 puDst->au8[28] = puSrc1->au8[28] + puSrc2->au8[28];
7862 puDst->au8[29] = puSrc1->au8[29] + puSrc2->au8[29];
7863 puDst->au8[30] = puSrc1->au8[30] + puSrc2->au8[30];
7864 puDst->au8[31] = puSrc1->au8[31] + puSrc2->au8[31];
7865}
7866
7867
7868/*
7869 * PADDSB / VPADDSB
7870 */
7871#define SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(a_iWord) \
7872 ( (uint16_t)((a_iWord) + 0x80) <= (uint16_t)0xff \
7873 ? (uint8_t)(a_iWord) \
7874 : (uint8_t)0x7f + (uint8_t)(((a_iWord) >> 15) & 1) ) /* 0x7f = INT8_MAX; 0x80 = INT8_MIN; source bit 15 = sign */
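/* Example: 0x50 + 0x50 = 160 exceeds INT8_MAX and saturates to 0x7f, while
   -0x70 + -0x70 = -224 underflows and saturates to 0x80 (INT8_MIN); in-range
   sums such as 0x20 + 0x30 = 0x50 pass through unchanged. */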
7875
7876#ifdef IEM_WITHOUT_ASSEMBLY
7877
7878IEM_DECL_IMPL_DEF(void, iemAImpl_paddsb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7879{
7880 RT_NOREF(pFpuState);
7881 RTUINT64U uSrc1 = { *puDst };
7882 RTUINT64U uSrc2 = { *puSrc };
7883 RTUINT64U uDst;
7884 uDst.au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[0] + uSrc2.ai8[0]);
7885 uDst.au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[1] + uSrc2.ai8[1]);
7886 uDst.au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[2] + uSrc2.ai8[2]);
7887 uDst.au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[3] + uSrc2.ai8[3]);
7888 uDst.au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[4] + uSrc2.ai8[4]);
7889 uDst.au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[5] + uSrc2.ai8[5]);
7890 uDst.au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[6] + uSrc2.ai8[6]);
7891 uDst.au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[7] + uSrc2.ai8[7]);
7892 *puDst = uDst.u;
7893}
7894
7895
7896IEM_DECL_IMPL_DEF(void, iemAImpl_paddsb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7897{
7898 RT_NOREF(pFpuState);
7899 RTUINT128U uSrc1 = *puDst;
7900 puDst->au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[0] + puSrc->ai8[0]);
7901 puDst->au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[1] + puSrc->ai8[1]);
7902 puDst->au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[2] + puSrc->ai8[2]);
7903 puDst->au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[3] + puSrc->ai8[3]);
7904 puDst->au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[4] + puSrc->ai8[4]);
7905 puDst->au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[5] + puSrc->ai8[5]);
7906 puDst->au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[6] + puSrc->ai8[6]);
7907 puDst->au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[7] + puSrc->ai8[7]);
7908 puDst->au8[8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[8] + puSrc->ai8[8]);
7909 puDst->au8[9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[9] + puSrc->ai8[9]);
7910 puDst->au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[10] + puSrc->ai8[10]);
7911 puDst->au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[11] + puSrc->ai8[11]);
7912 puDst->au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[12] + puSrc->ai8[12]);
7913 puDst->au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[13] + puSrc->ai8[13]);
7914 puDst->au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[14] + puSrc->ai8[14]);
7915 puDst->au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[15] + puSrc->ai8[15]);
7916}
7917
7918#endif
7919
7920
7921/*
7922 * PADDUSB / VPADDUSB
7923 */
7924#define SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(a_uWord) \
7925 ( (uint16_t)(a_uWord) <= (uint16_t)0xff \
7926 ? (uint8_t)(a_uWord) \
7927 : (uint8_t)0xff ) /* 0xff = UINT8_MAX */
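/* Example: 0xc0 + 0xc0 = 0x180 exceeds UINT8_MAX and saturates to 0xff, while
   0x40 + 0x40 = 0x80 passes through unchanged. */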
7928
7929#ifdef IEM_WITHOUT_ASSEMBLY
7930
7931IEM_DECL_IMPL_DEF(void, iemAImpl_paddusb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7932{
7933 RT_NOREF(pFpuState);
7934 RTUINT64U uSrc1 = { *puDst };
7935 RTUINT64U uSrc2 = { *puSrc };
7936 RTUINT64U uDst;
7937 uDst.au8[0] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[0] + uSrc2.au8[0]);
7938 uDst.au8[1] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[1] + uSrc2.au8[1]);
7939 uDst.au8[2] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[2] + uSrc2.au8[2]);
7940 uDst.au8[3] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[3] + uSrc2.au8[3]);
7941 uDst.au8[4] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[4] + uSrc2.au8[4]);
7942 uDst.au8[5] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[5] + uSrc2.au8[5]);
7943 uDst.au8[6] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[6] + uSrc2.au8[6]);
7944 uDst.au8[7] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[7] + uSrc2.au8[7]);
7945 *puDst = uDst.u;
7946}
7947
7948
7949IEM_DECL_IMPL_DEF(void, iemAImpl_paddusb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7950{
7951 RT_NOREF(pFpuState);
7952 RTUINT128U uSrc1 = *puDst;
7953 puDst->au8[0] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[0] + puSrc->au8[0]);
7954 puDst->au8[1] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[1] + puSrc->au8[1]);
7955 puDst->au8[2] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[2] + puSrc->au8[2]);
7956 puDst->au8[3] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[3] + puSrc->au8[3]);
7957 puDst->au8[4] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[4] + puSrc->au8[4]);
7958 puDst->au8[5] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[5] + puSrc->au8[5]);
7959 puDst->au8[6] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[6] + puSrc->au8[6]);
7960 puDst->au8[7] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[7] + puSrc->au8[7]);
7961 puDst->au8[8] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[8] + puSrc->au8[8]);
7962 puDst->au8[9] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[9] + puSrc->au8[9]);
7963 puDst->au8[10] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[10] + puSrc->au8[10]);
7964 puDst->au8[11] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[11] + puSrc->au8[11]);
7965 puDst->au8[12] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[12] + puSrc->au8[12]);
7966 puDst->au8[13] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[13] + puSrc->au8[13]);
7967 puDst->au8[14] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[14] + puSrc->au8[14]);
7968 puDst->au8[15] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[15] + puSrc->au8[15]);
7969}
7970
7971#endif
7972
7973
7974/*
7975 * PADDW / VPADDW
7976 */
7977#ifdef IEM_WITHOUT_ASSEMBLY
7978
7979IEM_DECL_IMPL_DEF(void, iemAImpl_paddw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7980{
7981 RT_NOREF(pFpuState);
7982 RTUINT64U uSrc1 = { *puDst };
7983 RTUINT64U uSrc2 = { *puSrc };
7984 RTUINT64U uDst;
7985 uDst.au16[0] = uSrc1.au16[0] + uSrc2.au16[0];
7986 uDst.au16[1] = uSrc1.au16[1] + uSrc2.au16[1];
7987 uDst.au16[2] = uSrc1.au16[2] + uSrc2.au16[2];
7988 uDst.au16[3] = uSrc1.au16[3] + uSrc2.au16[3];
7989 *puDst = uDst.u;
7990}
7991
7992
7993IEM_DECL_IMPL_DEF(void, iemAImpl_paddw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7994{
7995 RT_NOREF(pFpuState);
7996 RTUINT128U uSrc1 = *puDst;
7997 puDst->au16[0] = uSrc1.au16[0] + puSrc->au16[0];
7998 puDst->au16[1] = uSrc1.au16[1] + puSrc->au16[1];
7999 puDst->au16[2] = uSrc1.au16[2] + puSrc->au16[2];
8000 puDst->au16[3] = uSrc1.au16[3] + puSrc->au16[3];
8001 puDst->au16[4] = uSrc1.au16[4] + puSrc->au16[4];
8002 puDst->au16[5] = uSrc1.au16[5] + puSrc->au16[5];
8003 puDst->au16[6] = uSrc1.au16[6] + puSrc->au16[6];
8004 puDst->au16[7] = uSrc1.au16[7] + puSrc->au16[7];
8005}
8006
8007#endif
8008
8009
8010IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8011 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8012{
8013 RT_NOREF(pExtState);
8014 puDst->au16[0] = puSrc1->au16[0] + puSrc2->au16[0];
8015 puDst->au16[1] = puSrc1->au16[1] + puSrc2->au16[1];
8016 puDst->au16[2] = puSrc1->au16[2] + puSrc2->au16[2];
8017 puDst->au16[3] = puSrc1->au16[3] + puSrc2->au16[3];
8018 puDst->au16[4] = puSrc1->au16[4] + puSrc2->au16[4];
8019 puDst->au16[5] = puSrc1->au16[5] + puSrc2->au16[5];
8020 puDst->au16[6] = puSrc1->au16[6] + puSrc2->au16[6];
8021 puDst->au16[7] = puSrc1->au16[7] + puSrc2->au16[7];
8022}
8023
8024IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8025 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8026{
8027 RT_NOREF(pExtState);
8028 puDst->au16[0] = puSrc1->au16[0] + puSrc2->au16[0];
8029 puDst->au16[1] = puSrc1->au16[1] + puSrc2->au16[1];
8030 puDst->au16[2] = puSrc1->au16[2] + puSrc2->au16[2];
8031 puDst->au16[3] = puSrc1->au16[3] + puSrc2->au16[3];
8032 puDst->au16[4] = puSrc1->au16[4] + puSrc2->au16[4];
8033 puDst->au16[5] = puSrc1->au16[5] + puSrc2->au16[5];
8034 puDst->au16[6] = puSrc1->au16[6] + puSrc2->au16[6];
8035 puDst->au16[7] = puSrc1->au16[7] + puSrc2->au16[7];
8036 puDst->au16[8] = puSrc1->au16[8] + puSrc2->au16[8];
8037 puDst->au16[9] = puSrc1->au16[9] + puSrc2->au16[9];
8038 puDst->au16[10] = puSrc1->au16[10] + puSrc2->au16[10];
8039 puDst->au16[11] = puSrc1->au16[11] + puSrc2->au16[11];
8040 puDst->au16[12] = puSrc1->au16[12] + puSrc2->au16[12];
8041 puDst->au16[13] = puSrc1->au16[13] + puSrc2->au16[13];
8042 puDst->au16[14] = puSrc1->au16[14] + puSrc2->au16[14];
8043 puDst->au16[15] = puSrc1->au16[15] + puSrc2->au16[15];
8044}
8045
8046
8047/*
8048 * PADDSW / VPADDSW
8049 */
8050#define SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(a_iDword) \
8051 ( (uint32_t)((a_iDword) + 0x8000) <= (uint16_t)0xffff \
8052 ? (uint16_t)(a_iDword) \
8053 : (uint16_t)0x7fff + (uint16_t)(((a_iDword) >> 31) & 1) ) /* 0x7fff = INT16_MAX; 0x8000 = INT16_MIN; source bit 31 = sign */
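/* Example: 0x7000 + 0x7000 = 0xe000 exceeds INT16_MAX and saturates to 0x7fff,
   while -0x7000 + -0x7000 = -0xe000 underflows and saturates to 0x8000 (INT16_MIN). */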
8054
8055#ifdef IEM_WITHOUT_ASSEMBLY
8056
8057IEM_DECL_IMPL_DEF(void, iemAImpl_paddsw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8058{
8059 RT_NOREF(pFpuState);
8060 RTUINT64U uSrc1 = { *puDst };
8061 RTUINT64U uSrc2 = { *puSrc };
8062 RTUINT64U uDst;
8063 uDst.au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] + uSrc2.ai16[0]);
8064 uDst.au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[1] + uSrc2.ai16[1]);
8065 uDst.au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] + uSrc2.ai16[2]);
8066 uDst.au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[3] + uSrc2.ai16[3]);
8067 *puDst = uDst.u;
8068}
8069
8070
8071IEM_DECL_IMPL_DEF(void, iemAImpl_paddsw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8072{
8073 RT_NOREF(pFpuState);
8074 RTUINT128U uSrc1 = *puDst;
8075 puDst->au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] + puSrc->ai16[0]);
8076 puDst->au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[1] + puSrc->ai16[1]);
8077 puDst->au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] + puSrc->ai16[2]);
8078 puDst->au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[3] + puSrc->ai16[3]);
8079 puDst->au16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[4] + puSrc->ai16[4]);
8080 puDst->au16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[5] + puSrc->ai16[5]);
8081 puDst->au16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[6] + puSrc->ai16[6]);
8082 puDst->au16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[7] + puSrc->ai16[7]);
8083}
8084
8085#endif
8086
8087
8088/*
8089 * PADDUSW / VPADDUSW
8090 */
8091#define SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(a_uDword) \
8092 ( (uint32_t)(a_uDword) <= (uint16_t)0xffff \
8093 ? (uint16_t)(a_uDword) \
8094 : (uint16_t)0xffff ) /* 0xffff = UINT16_MAX */
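/* Example: 0xc000 + 0xc000 = 0x18000 exceeds UINT16_MAX and saturates to 0xffff. */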
8095
8096#ifdef IEM_WITHOUT_ASSEMBLY
8097
8098IEM_DECL_IMPL_DEF(void, iemAImpl_paddusw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8099{
8100 RT_NOREF(pFpuState);
8101 RTUINT64U uSrc1 = { *puDst };
8102 RTUINT64U uSrc2 = { *puSrc };
8103 RTUINT64U uDst;
8104 uDst.au16[0] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[0] + uSrc2.au16[0]);
8105 uDst.au16[1] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[1] + uSrc2.au16[1]);
8106 uDst.au16[2] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[2] + uSrc2.au16[2]);
8107 uDst.au16[3] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[3] + uSrc2.au16[3]);
8108 *puDst = uDst.u;
8109}
8110
8111
8112IEM_DECL_IMPL_DEF(void, iemAImpl_paddusw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8113{
8114 RT_NOREF(pFpuState);
8115 RTUINT128U uSrc1 = *puDst;
8116 puDst->au16[0] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[0] + puSrc->au16[0]);
8117 puDst->au16[1] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[1] + puSrc->au16[1]);
8118 puDst->au16[2] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[2] + puSrc->au16[2]);
8119 puDst->au16[3] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[3] + puSrc->au16[3]);
8120 puDst->au16[4] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[4] + puSrc->au16[4]);
8121 puDst->au16[5] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[5] + puSrc->au16[5]);
8122 puDst->au16[6] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[6] + puSrc->au16[6]);
8123 puDst->au16[7] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[7] + puSrc->au16[7]);
8124}
8125
8126#endif
8127
8128
8129/*
8130 * PADDD / VPADDD.
8131 */
8132#ifdef IEM_WITHOUT_ASSEMBLY
8133
8134IEM_DECL_IMPL_DEF(void, iemAImpl_paddd_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8135{
8136 RT_NOREF(pFpuState);
8137 RTUINT64U uSrc1 = { *puDst };
8138 RTUINT64U uSrc2 = { *puSrc };
8139 RTUINT64U uDst;
8140 uDst.au32[0] = uSrc1.au32[0] + uSrc2.au32[0];
8141 uDst.au32[1] = uSrc1.au32[1] + uSrc2.au32[1];
8142 *puDst = uDst.u;
8143}
8144
8145
8146IEM_DECL_IMPL_DEF(void, iemAImpl_paddd_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8147{
8148 RT_NOREF(pFpuState);
8149 RTUINT128U uSrc1 = *puDst;
8150 puDst->au32[0] = uSrc1.au32[0] + puSrc->au32[0];
8151 puDst->au32[1] = uSrc1.au32[1] + puSrc->au32[1];
8152 puDst->au32[2] = uSrc1.au32[2] + puSrc->au32[2];
8153 puDst->au32[3] = uSrc1.au32[3] + puSrc->au32[3];
8154}
8155
8156#endif /* IEM_WITHOUT_ASSEMBLY */
8157
8158IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddd_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8159 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8160{
8161 RT_NOREF(pExtState);
8162 puDst->au32[0] = puSrc1->au32[0] + puSrc2->au32[0];
8163 puDst->au32[1] = puSrc1->au32[1] + puSrc2->au32[1];
8164 puDst->au32[2] = puSrc1->au32[2] + puSrc2->au32[2];
8165 puDst->au32[3] = puSrc1->au32[3] + puSrc2->au32[3];
8166}
8167
8168IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddd_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8169 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8170{
8171 RT_NOREF(pExtState);
8172 puDst->au32[0] = puSrc1->au32[0] + puSrc2->au32[0];
8173 puDst->au32[1] = puSrc1->au32[1] + puSrc2->au32[1];
8174 puDst->au32[2] = puSrc1->au32[2] + puSrc2->au32[2];
8175 puDst->au32[3] = puSrc1->au32[3] + puSrc2->au32[3];
8176 puDst->au32[4] = puSrc1->au32[4] + puSrc2->au32[4];
8177 puDst->au32[5] = puSrc1->au32[5] + puSrc2->au32[5];
8178 puDst->au32[6] = puSrc1->au32[6] + puSrc2->au32[6];
8179 puDst->au32[7] = puSrc1->au32[7] + puSrc2->au32[7];
8180}
8181
8182
8183/*
8184 * PADDQ / VPADDQ.
8185 */
8186#ifdef IEM_WITHOUT_ASSEMBLY
8187
8188IEM_DECL_IMPL_DEF(void, iemAImpl_paddq_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8189{
8190 RT_NOREF(pFpuState);
8191 *puDst = *puDst + *puSrc;
8192}
8193
8194IEM_DECL_IMPL_DEF(void, iemAImpl_paddq_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8195{
8196 RT_NOREF(pFpuState);
8197 RTUINT128U uSrc1 = *puDst;
8198 puDst->au64[0] = uSrc1.au64[0] + puSrc->au64[0];
8199 puDst->au64[1] = uSrc1.au64[1] + puSrc->au64[1];
8200}
8201
8202#endif
8203
8204IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddq_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8205 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8206{
8207 RT_NOREF(pExtState);
8208 puDst->au64[0] = puSrc1->au64[0] + puSrc2->au64[0];
8209 puDst->au64[1] = puSrc1->au64[1] + puSrc2->au64[1];
8210}
8211
8212IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddq_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8213 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8214{
8215 RT_NOREF(pExtState);
8216 puDst->au64[0] = puSrc1->au64[0] + puSrc2->au64[0];
8217 puDst->au64[1] = puSrc1->au64[1] + puSrc2->au64[1];
8218 puDst->au64[2] = puSrc1->au64[2] + puSrc2->au64[2];
8219 puDst->au64[3] = puSrc1->au64[3] + puSrc2->au64[3];
8220}
8221
8222
8223/*
8224 * PSUBB / VPSUBB
8225 */
8226#ifdef IEM_WITHOUT_ASSEMBLY
8227
8228IEM_DECL_IMPL_DEF(void, iemAImpl_psubb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8229{
8230 RT_NOREF(pFpuState);
8231 RTUINT64U uSrc1 = { *puDst };
8232 RTUINT64U uSrc2 = { *puSrc };
8233 RTUINT64U uDst;
8234 uDst.au8[0] = uSrc1.au8[0] - uSrc2.au8[0];
8235 uDst.au8[1] = uSrc1.au8[1] - uSrc2.au8[1];
8236 uDst.au8[2] = uSrc1.au8[2] - uSrc2.au8[2];
8237 uDst.au8[3] = uSrc1.au8[3] - uSrc2.au8[3];
8238 uDst.au8[4] = uSrc1.au8[4] - uSrc2.au8[4];
8239 uDst.au8[5] = uSrc1.au8[5] - uSrc2.au8[5];
8240 uDst.au8[6] = uSrc1.au8[6] - uSrc2.au8[6];
8241 uDst.au8[7] = uSrc1.au8[7] - uSrc2.au8[7];
8242 *puDst = uDst.u;
8243}
8244
8245
8246IEM_DECL_IMPL_DEF(void, iemAImpl_psubb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8247{
8248 RT_NOREF(pFpuState);
8249 RTUINT128U uSrc1 = *puDst;
8250 puDst->au8[0] = uSrc1.au8[0] - puSrc->au8[0];
8251 puDst->au8[1] = uSrc1.au8[1] - puSrc->au8[1];
8252 puDst->au8[2] = uSrc1.au8[2] - puSrc->au8[2];
8253 puDst->au8[3] = uSrc1.au8[3] - puSrc->au8[3];
8254 puDst->au8[4] = uSrc1.au8[4] - puSrc->au8[4];
8255 puDst->au8[5] = uSrc1.au8[5] - puSrc->au8[5];
8256 puDst->au8[6] = uSrc1.au8[6] - puSrc->au8[6];
8257 puDst->au8[7] = uSrc1.au8[7] - puSrc->au8[7];
8258 puDst->au8[8] = uSrc1.au8[8] - puSrc->au8[8];
8259 puDst->au8[9] = uSrc1.au8[9] - puSrc->au8[9];
8260 puDst->au8[10] = uSrc1.au8[10] - puSrc->au8[10];
8261 puDst->au8[11] = uSrc1.au8[11] - puSrc->au8[11];
8262 puDst->au8[12] = uSrc1.au8[12] - puSrc->au8[12];
8263 puDst->au8[13] = uSrc1.au8[13] - puSrc->au8[13];
8264 puDst->au8[14] = uSrc1.au8[14] - puSrc->au8[14];
8265 puDst->au8[15] = uSrc1.au8[15] - puSrc->au8[15];
8266}
8267
8268#endif
8269
8270IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubb_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8271 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8272{
8273 RT_NOREF(pExtState);
8274 puDst->au8[0] = puSrc1->au8[0] - puSrc2->au8[0];
8275 puDst->au8[1] = puSrc1->au8[1] - puSrc2->au8[1];
8276 puDst->au8[2] = puSrc1->au8[2] - puSrc2->au8[2];
8277 puDst->au8[3] = puSrc1->au8[3] - puSrc2->au8[3];
8278 puDst->au8[4] = puSrc1->au8[4] - puSrc2->au8[4];
8279 puDst->au8[5] = puSrc1->au8[5] - puSrc2->au8[5];
8280 puDst->au8[6] = puSrc1->au8[6] - puSrc2->au8[6];
8281 puDst->au8[7] = puSrc1->au8[7] - puSrc2->au8[7];
8282 puDst->au8[8] = puSrc1->au8[8] - puSrc2->au8[8];
8283 puDst->au8[9] = puSrc1->au8[9] - puSrc2->au8[9];
8284 puDst->au8[10] = puSrc1->au8[10] - puSrc2->au8[10];
8285 puDst->au8[11] = puSrc1->au8[11] - puSrc2->au8[11];
8286 puDst->au8[12] = puSrc1->au8[12] - puSrc2->au8[12];
8287 puDst->au8[13] = puSrc1->au8[13] - puSrc2->au8[13];
8288 puDst->au8[14] = puSrc1->au8[14] - puSrc2->au8[14];
8289 puDst->au8[15] = puSrc1->au8[15] - puSrc2->au8[15];
8290}
8291
8292IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubb_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8293 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8294{
8295 RT_NOREF(pExtState);
8296 puDst->au8[0] = puSrc1->au8[0] - puSrc2->au8[0];
8297 puDst->au8[1] = puSrc1->au8[1] - puSrc2->au8[1];
8298 puDst->au8[2] = puSrc1->au8[2] - puSrc2->au8[2];
8299 puDst->au8[3] = puSrc1->au8[3] - puSrc2->au8[3];
8300 puDst->au8[4] = puSrc1->au8[4] - puSrc2->au8[4];
8301 puDst->au8[5] = puSrc1->au8[5] - puSrc2->au8[5];
8302 puDst->au8[6] = puSrc1->au8[6] - puSrc2->au8[6];
8303 puDst->au8[7] = puSrc1->au8[7] - puSrc2->au8[7];
8304 puDst->au8[8] = puSrc1->au8[8] - puSrc2->au8[8];
8305 puDst->au8[9] = puSrc1->au8[9] - puSrc2->au8[9];
8306 puDst->au8[10] = puSrc1->au8[10] - puSrc2->au8[10];
8307 puDst->au8[11] = puSrc1->au8[11] - puSrc2->au8[11];
8308 puDst->au8[12] = puSrc1->au8[12] - puSrc2->au8[12];
8309 puDst->au8[13] = puSrc1->au8[13] - puSrc2->au8[13];
8310 puDst->au8[14] = puSrc1->au8[14] - puSrc2->au8[14];
8311 puDst->au8[15] = puSrc1->au8[15] - puSrc2->au8[15];
8312 puDst->au8[16] = puSrc1->au8[16] - puSrc2->au8[16];
8313 puDst->au8[17] = puSrc1->au8[17] - puSrc2->au8[17];
8314 puDst->au8[18] = puSrc1->au8[18] - puSrc2->au8[18];
8315 puDst->au8[19] = puSrc1->au8[19] - puSrc2->au8[19];
8316 puDst->au8[20] = puSrc1->au8[20] - puSrc2->au8[20];
8317 puDst->au8[21] = puSrc1->au8[21] - puSrc2->au8[21];
8318 puDst->au8[22] = puSrc1->au8[22] - puSrc2->au8[22];
8319 puDst->au8[23] = puSrc1->au8[23] - puSrc2->au8[23];
8320 puDst->au8[24] = puSrc1->au8[24] - puSrc2->au8[24];
8321 puDst->au8[25] = puSrc1->au8[25] - puSrc2->au8[25];
8322 puDst->au8[26] = puSrc1->au8[26] - puSrc2->au8[26];
8323 puDst->au8[27] = puSrc1->au8[27] - puSrc2->au8[27];
8324 puDst->au8[28] = puSrc1->au8[28] - puSrc2->au8[28];
8325 puDst->au8[29] = puSrc1->au8[29] - puSrc2->au8[29];
8326 puDst->au8[30] = puSrc1->au8[30] - puSrc2->au8[30];
8327 puDst->au8[31] = puSrc1->au8[31] - puSrc2->au8[31];
8328}
8329
8330
8331/*
8332 * PSUBSB / VPSUBSB
8333 */
8334#ifdef IEM_WITHOUT_ASSEMBLY
8335
8336IEM_DECL_IMPL_DEF(void, iemAImpl_psubsb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8337{
8338 RT_NOREF(pFpuState);
8339 RTUINT64U uSrc1 = { *puDst };
8340 RTUINT64U uSrc2 = { *puSrc };
8341 RTUINT64U uDst;
8342 uDst.au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[0] - uSrc2.ai8[0]);
8343 uDst.au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[1] - uSrc2.ai8[1]);
8344 uDst.au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[2] - uSrc2.ai8[2]);
8345 uDst.au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[3] - uSrc2.ai8[3]);
8346 uDst.au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[4] - uSrc2.ai8[4]);
8347 uDst.au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[5] - uSrc2.ai8[5]);
8348 uDst.au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[6] - uSrc2.ai8[6]);
8349 uDst.au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[7] - uSrc2.ai8[7]);
8350 *puDst = uDst.u;
8351}
8352
8353
8354IEM_DECL_IMPL_DEF(void, iemAImpl_psubsb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8355{
8356 RT_NOREF(pFpuState);
8357 RTUINT128U uSrc1 = *puDst;
8358 puDst->au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[0] - puSrc->ai8[0]);
8359 puDst->au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[1] - puSrc->ai8[1]);
8360 puDst->au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[2] - puSrc->ai8[2]);
8361 puDst->au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[3] - puSrc->ai8[3]);
8362 puDst->au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[4] - puSrc->ai8[4]);
8363 puDst->au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[5] - puSrc->ai8[5]);
8364 puDst->au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[6] - puSrc->ai8[6]);
8365 puDst->au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[7] - puSrc->ai8[7]);
8366 puDst->au8[8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[8] - puSrc->ai8[8]);
8367 puDst->au8[9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[9] - puSrc->ai8[9]);
8368 puDst->au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[10] - puSrc->ai8[10]);
8369 puDst->au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[11] - puSrc->ai8[11]);
8370 puDst->au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[12] - puSrc->ai8[12]);
8371 puDst->au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[13] - puSrc->ai8[13]);
8372 puDst->au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[14] - puSrc->ai8[14]);
8373 puDst->au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[15] - puSrc->ai8[15]);
8374}
8375
8376#endif
8377
8378
8379/*
8380 * PSUBUSB / VPSUBUSB
8381 */
8382#define SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(a_uWord) \
8383 ( (uint16_t)(a_uWord) <= (uint16_t)0xff \
8384 ? (uint8_t)(a_uWord) \
8385 : (uint8_t)0 )
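/* Example: 0x10 - 0x20 borrows (the intermediate difference is negative) and
   saturates to 0, while 0x20 - 0x10 = 0x10 passes through unchanged. */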
8386
8387#ifdef IEM_WITHOUT_ASSEMBLY
8388
8389IEM_DECL_IMPL_DEF(void, iemAImpl_psubusb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8390{
8391 RT_NOREF(pFpuState);
8392 RTUINT64U uSrc1 = { *puDst };
8393 RTUINT64U uSrc2 = { *puSrc };
8394 RTUINT64U uDst;
8395 uDst.au8[0] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[0] - uSrc2.au8[0]);
8396 uDst.au8[1] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[1] - uSrc2.au8[1]);
8397 uDst.au8[2] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[2] - uSrc2.au8[2]);
8398 uDst.au8[3] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[3] - uSrc2.au8[3]);
8399 uDst.au8[4] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[4] - uSrc2.au8[4]);
8400 uDst.au8[5] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[5] - uSrc2.au8[5]);
8401 uDst.au8[6] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[6] - uSrc2.au8[6]);
8402 uDst.au8[7] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[7] - uSrc2.au8[7]);
8403 *puDst = uDst.u;
8404}
8405
8406
8407IEM_DECL_IMPL_DEF(void, iemAImpl_psubusb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8408{
8409 RT_NOREF(pFpuState);
8410 RTUINT128U uSrc1 = *puDst;
8411 puDst->au8[0] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[0] - puSrc->au8[0]);
8412 puDst->au8[1] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[1] - puSrc->au8[1]);
8413 puDst->au8[2] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[2] - puSrc->au8[2]);
8414 puDst->au8[3] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[3] - puSrc->au8[3]);
8415 puDst->au8[4] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[4] - puSrc->au8[4]);
8416 puDst->au8[5] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[5] - puSrc->au8[5]);
8417 puDst->au8[6] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[6] - puSrc->au8[6]);
8418 puDst->au8[7] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[7] - puSrc->au8[7]);
8419 puDst->au8[8] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[8] - puSrc->au8[8]);
8420 puDst->au8[9] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[9] - puSrc->au8[9]);
8421 puDst->au8[10] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[10] - puSrc->au8[10]);
8422 puDst->au8[11] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[11] - puSrc->au8[11]);
8423 puDst->au8[12] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[12] - puSrc->au8[12]);
8424 puDst->au8[13] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[13] - puSrc->au8[13]);
8425 puDst->au8[14] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[14] - puSrc->au8[14]);
8426 puDst->au8[15] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[15] - puSrc->au8[15]);
8427}
8428
8429#endif
8430
8431
8432/*
8433 * PSUBW / VPSUBW
8434 */
8435#ifdef IEM_WITHOUT_ASSEMBLY
8436
8437IEM_DECL_IMPL_DEF(void, iemAImpl_psubw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8438{
8439 RT_NOREF(pFpuState);
8440 RTUINT64U uSrc1 = { *puDst };
8441 RTUINT64U uSrc2 = { *puSrc };
8442 RTUINT64U uDst;
8443 uDst.au16[0] = uSrc1.au16[0] - uSrc2.au16[0];
8444 uDst.au16[1] = uSrc1.au16[1] - uSrc2.au16[1];
8445 uDst.au16[2] = uSrc1.au16[2] - uSrc2.au16[2];
8446 uDst.au16[3] = uSrc1.au16[3] - uSrc2.au16[3];
8447 *puDst = uDst.u;
8448}
8449
8450
8451IEM_DECL_IMPL_DEF(void, iemAImpl_psubw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8452{
8453 RT_NOREF(pFpuState);
8454 RTUINT128U uSrc1 = *puDst;
8455 puDst->au16[0] = uSrc1.au16[0] - puSrc->au16[0];
8456 puDst->au16[1] = uSrc1.au16[1] - puSrc->au16[1];
8457 puDst->au16[2] = uSrc1.au16[2] - puSrc->au16[2];
8458 puDst->au16[3] = uSrc1.au16[3] - puSrc->au16[3];
8459 puDst->au16[4] = uSrc1.au16[4] - puSrc->au16[4];
8460 puDst->au16[5] = uSrc1.au16[5] - puSrc->au16[5];
8461 puDst->au16[6] = uSrc1.au16[6] - puSrc->au16[6];
8462 puDst->au16[7] = uSrc1.au16[7] - puSrc->au16[7];
8463}
8464
8465#endif
8466
8467IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8468 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8469{
8470 RT_NOREF(pExtState);
8471 puDst->au16[0] = puSrc1->au16[0] - puSrc2->au16[0];
8472 puDst->au16[1] = puSrc1->au16[1] - puSrc2->au16[1];
8473 puDst->au16[2] = puSrc1->au16[2] - puSrc2->au16[2];
8474 puDst->au16[3] = puSrc1->au16[3] - puSrc2->au16[3];
8475 puDst->au16[4] = puSrc1->au16[4] - puSrc2->au16[4];
8476 puDst->au16[5] = puSrc1->au16[5] - puSrc2->au16[5];
8477 puDst->au16[6] = puSrc1->au16[6] - puSrc2->au16[6];
8478 puDst->au16[7] = puSrc1->au16[7] - puSrc2->au16[7];
8479}
8480
8481IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8482 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8483{
8484 RT_NOREF(pExtState);
8485 puDst->au16[0] = puSrc1->au16[0] - puSrc2->au16[0];
8486 puDst->au16[1] = puSrc1->au16[1] - puSrc2->au16[1];
8487 puDst->au16[2] = puSrc1->au16[2] - puSrc2->au16[2];
8488 puDst->au16[3] = puSrc1->au16[3] - puSrc2->au16[3];
8489 puDst->au16[4] = puSrc1->au16[4] - puSrc2->au16[4];
8490 puDst->au16[5] = puSrc1->au16[5] - puSrc2->au16[5];
8491 puDst->au16[6] = puSrc1->au16[6] - puSrc2->au16[6];
8492 puDst->au16[7] = puSrc1->au16[7] - puSrc2->au16[7];
8493 puDst->au16[8] = puSrc1->au16[8] - puSrc2->au16[8];
8494 puDst->au16[9] = puSrc1->au16[9] - puSrc2->au16[9];
8495 puDst->au16[10] = puSrc1->au16[10] - puSrc2->au16[10];
8496 puDst->au16[11] = puSrc1->au16[11] - puSrc2->au16[11];
8497 puDst->au16[12] = puSrc1->au16[12] - puSrc2->au16[12];
8498 puDst->au16[13] = puSrc1->au16[13] - puSrc2->au16[13];
8499 puDst->au16[14] = puSrc1->au16[14] - puSrc2->au16[14];
8500 puDst->au16[15] = puSrc1->au16[15] - puSrc2->au16[15];
8501}
8502
8503
8504/*
8505 * PSUBSW / VPSUBSW
8506 */
8507#ifdef IEM_WITHOUT_ASSEMBLY
8508
8509IEM_DECL_IMPL_DEF(void, iemAImpl_psubsw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8510{
8511 RT_NOREF(pFpuState);
8512 RTUINT64U uSrc1 = { *puDst };
8513 RTUINT64U uSrc2 = { *puSrc };
8514 RTUINT64U uDst;
8515 uDst.au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] - uSrc2.ai16[0]);
8516 uDst.au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[1] - uSrc2.ai16[1]);
8517 uDst.au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] - uSrc2.ai16[2]);
8518 uDst.au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[3] - uSrc2.ai16[3]);
8519 *puDst = uDst.u;
8520}
8521
8522
8523IEM_DECL_IMPL_DEF(void, iemAImpl_psubsw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8524{
8525 RT_NOREF(pFpuState);
8526 RTUINT128U uSrc1 = *puDst;
8527 puDst->au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] - puSrc->ai16[0]);
8528 puDst->au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[1] - puSrc->ai16[1]);
8529 puDst->au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] - puSrc->ai16[2]);
8530 puDst->au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[3] - puSrc->ai16[3]);
8531 puDst->au16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[4] - puSrc->ai16[4]);
8532 puDst->au16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[5] - puSrc->ai16[5]);
8533 puDst->au16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[6] - puSrc->ai16[6]);
8534 puDst->au16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[7] - puSrc->ai16[7]);
8535}
8536
8537#endif
8538
8539
8540/*
8541 * PSUBUSW / VPSUBUSW
8542 */
8543#define SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(a_uDword) \
8544 ( (uint32_t)(a_uDword) <= (uint16_t)0xffff \
8545 ? (uint16_t)(a_uDword) \
8546 : (uint16_t)0 )
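/* Example: 0x1000 - 0x2000 borrows and saturates to 0, while 0x2000 - 0x1000 = 0x1000
   passes through unchanged. */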
8547
8548#ifdef IEM_WITHOUT_ASSEMBLY
8549
8550IEM_DECL_IMPL_DEF(void, iemAImpl_psubusw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8551{
8552 RT_NOREF(pFpuState);
8553 RTUINT64U uSrc1 = { *puDst };
8554 RTUINT64U uSrc2 = { *puSrc };
8555 RTUINT64U uDst;
8556 uDst.au16[0] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[0] - uSrc2.au16[0]);
8557 uDst.au16[1] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[1] - uSrc2.au16[1]);
8558 uDst.au16[2] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[2] - uSrc2.au16[2]);
8559 uDst.au16[3] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[3] - uSrc2.au16[3]);
8560 *puDst = uDst.u;
8561}
8562
8563
8564IEM_DECL_IMPL_DEF(void, iemAImpl_psubusw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8565{
8566 RT_NOREF(pFpuState);
8567 RTUINT128U uSrc1 = *puDst;
8568 puDst->au16[0] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[0] - puSrc->au16[0]);
8569 puDst->au16[1] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[1] - puSrc->au16[1]);
8570 puDst->au16[2] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[2] - puSrc->au16[2]);
8571 puDst->au16[3] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[3] - puSrc->au16[3]);
8572 puDst->au16[4] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[4] - puSrc->au16[4]);
8573 puDst->au16[5] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[5] - puSrc->au16[5]);
8574 puDst->au16[6] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[6] - puSrc->au16[6]);
8575 puDst->au16[7] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[7] - puSrc->au16[7]);
8576}
8577
8578#endif
8579
8580
8581/*
8582 * PSUBD / VPSUBD.
8583 */
8584#ifdef IEM_WITHOUT_ASSEMBLY
8585
8586IEM_DECL_IMPL_DEF(void, iemAImpl_psubd_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8587{
8588 RT_NOREF(pFpuState);
8589 RTUINT64U uSrc1 = { *puDst };
8590 RTUINT64U uSrc2 = { *puSrc };
8591 RTUINT64U uDst;
8592 uDst.au32[0] = uSrc1.au32[0] - uSrc2.au32[0];
8593 uDst.au32[1] = uSrc1.au32[1] - uSrc2.au32[1];
8594 *puDst = uDst.u;
8595}
8596
8597
8598IEM_DECL_IMPL_DEF(void, iemAImpl_psubd_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8599{
8600 RT_NOREF(pFpuState);
8601 RTUINT128U uSrc1 = *puDst;
8602 puDst->au32[0] = uSrc1.au32[0] - puSrc->au32[0];
8603 puDst->au32[1] = uSrc1.au32[1] - puSrc->au32[1];
8604 puDst->au32[2] = uSrc1.au32[2] - puSrc->au32[2];
8605 puDst->au32[3] = uSrc1.au32[3] - puSrc->au32[3];
8606}
8607
8608#endif /* IEM_WITHOUT_ASSEMBLY */
8609
8610IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubd_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8611 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8612{
8613 RT_NOREF(pExtState);
8614 puDst->au32[0] = puSrc1->au32[0] - puSrc2->au32[0];
8615 puDst->au32[1] = puSrc1->au32[1] - puSrc2->au32[1];
8616 puDst->au32[2] = puSrc1->au32[2] - puSrc2->au32[2];
8617 puDst->au32[3] = puSrc1->au32[3] - puSrc2->au32[3];
8618}
8619
8620IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubd_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8621 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8622{
8623 RT_NOREF(pExtState);
8624 puDst->au32[0] = puSrc1->au32[0] - puSrc2->au32[0];
8625 puDst->au32[1] = puSrc1->au32[1] - puSrc2->au32[1];
8626 puDst->au32[2] = puSrc1->au32[2] - puSrc2->au32[2];
8627 puDst->au32[3] = puSrc1->au32[3] - puSrc2->au32[3];
8628 puDst->au32[4] = puSrc1->au32[4] - puSrc2->au32[4];
8629 puDst->au32[5] = puSrc1->au32[5] - puSrc2->au32[5];
8630 puDst->au32[6] = puSrc1->au32[6] - puSrc2->au32[6];
8631 puDst->au32[7] = puSrc1->au32[7] - puSrc2->au32[7];
8632}
8633
8634
8635/*
8636 * PSUBQ / VPSUBQ.
8637 */
8638#ifdef IEM_WITHOUT_ASSEMBLY
8639
8640IEM_DECL_IMPL_DEF(void, iemAImpl_psubq_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8641{
8642 RT_NOREF(pFpuState);
8643 *puDst = *puDst - *puSrc;
8644}
8645
8646IEM_DECL_IMPL_DEF(void, iemAImpl_psubq_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8647{
8648 RT_NOREF(pFpuState);
8649 RTUINT128U uSrc1 = *puDst;
8650 puDst->au64[0] = uSrc1.au64[0] - puSrc->au64[0];
8651 puDst->au64[1] = uSrc1.au64[1] - puSrc->au64[1];
8652}
8653
8654#endif
8655
8656IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubq_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8657 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8658{
8659 RT_NOREF(pExtState);
8660 puDst->au64[0] = puSrc1->au64[0] - puSrc2->au64[0];
8661 puDst->au64[1] = puSrc1->au64[1] - puSrc2->au64[1];
8662}
8663
8664IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubq_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8665 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8666{
8667 RT_NOREF(pExtState);
8668 puDst->au64[0] = puSrc1->au64[0] - puSrc2->au64[0];
8669 puDst->au64[1] = puSrc1->au64[1] - puSrc2->au64[1];
8670 puDst->au64[2] = puSrc1->au64[2] - puSrc2->au64[2];
8671 puDst->au64[3] = puSrc1->au64[3] - puSrc2->au64[3];
8672}
8673
8674
8675
8676/*
8677 * PMULLW / VPMULLW
8678 */
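/* The truncated low 16 bits of the product are the same whether the operands are
   treated as signed or unsigned, e.g. 0x4001 * 0x0003 = 0x0000c003 keeps 0xc003. */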
8679#ifdef IEM_WITHOUT_ASSEMBLY
8680
8681IEM_DECL_IMPL_DEF(void, iemAImpl_pmullw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8682{
8683 RT_NOREF(pFpuState);
8684 RTUINT64U uSrc1 = { *puDst };
8685 RTUINT64U uSrc2 = { *puSrc };
8686 RTUINT64U uDst;
8687 uDst.ai16[0] = uSrc1.ai16[0] * uSrc2.ai16[0];
8688 uDst.ai16[1] = uSrc1.ai16[1] * uSrc2.ai16[1];
8689 uDst.ai16[2] = uSrc1.ai16[2] * uSrc2.ai16[2];
8690 uDst.ai16[3] = uSrc1.ai16[3] * uSrc2.ai16[3];
8691 *puDst = uDst.u;
8692}
8693
8694
8695IEM_DECL_IMPL_DEF(void, iemAImpl_pmullw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8696{
8697 RT_NOREF(pFpuState);
8698 RTUINT128U uSrc1 = *puDst;
8699 puDst->ai16[0] = uSrc1.ai16[0] * puSrc->ai16[0];
8700 puDst->ai16[1] = uSrc1.ai16[1] * puSrc->ai16[1];
8701 puDst->ai16[2] = uSrc1.ai16[2] * puSrc->ai16[2];
8702 puDst->ai16[3] = uSrc1.ai16[3] * puSrc->ai16[3];
8703 puDst->ai16[4] = uSrc1.ai16[4] * puSrc->ai16[4];
8704 puDst->ai16[5] = uSrc1.ai16[5] * puSrc->ai16[5];
8705 puDst->ai16[6] = uSrc1.ai16[6] * puSrc->ai16[6];
8706 puDst->ai16[7] = uSrc1.ai16[7] * puSrc->ai16[7];
8707}
8708
8709#endif
8710
8711
8712/*
8713 * PMULHW / VPMULHW
8714 */
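/* RT_HIWORD() keeps the high 16 bits of the full 32-bit signed product, e.g.
   0x4000 * 0x4000 = 0x10000000 yields 0x1000, and -0x4000 * 0x4000 = -0x10000000
   (0xf0000000) yields 0xf000, i.e. -0x1000 as a signed word. */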
8715#ifdef IEM_WITHOUT_ASSEMBLY
8716
8717IEM_DECL_IMPL_DEF(void, iemAImpl_pmulhw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8718{
8719 RT_NOREF(pFpuState);
8720 RTUINT64U uSrc1 = { *puDst };
8721 RTUINT64U uSrc2 = { *puSrc };
8722 RTUINT64U uDst;
8723 uDst.ai16[0] = RT_HIWORD(uSrc1.ai16[0] * uSrc2.ai16[0]);
8724 uDst.ai16[1] = RT_HIWORD(uSrc1.ai16[1] * uSrc2.ai16[1]);
8725 uDst.ai16[2] = RT_HIWORD(uSrc1.ai16[2] * uSrc2.ai16[2]);
8726 uDst.ai16[3] = RT_HIWORD(uSrc1.ai16[3] * uSrc2.ai16[3]);
8727 *puDst = uDst.u;
8728}
8729
8730
8731IEM_DECL_IMPL_DEF(void, iemAImpl_pmulhw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8732{
8733 RT_NOREF(pFpuState);
8734 RTUINT128U uSrc1 = *puDst;
8735 puDst->ai16[0] = RT_HIWORD(uSrc1.ai16[0] * puSrc->ai16[0]);
8736 puDst->ai16[1] = RT_HIWORD(uSrc1.ai16[1] * puSrc->ai16[1]);
8737 puDst->ai16[2] = RT_HIWORD(uSrc1.ai16[2] * puSrc->ai16[2]);
8738 puDst->ai16[3] = RT_HIWORD(uSrc1.ai16[3] * puSrc->ai16[3]);
8739 puDst->ai16[4] = RT_HIWORD(uSrc1.ai16[4] * puSrc->ai16[4]);
8740 puDst->ai16[5] = RT_HIWORD(uSrc1.ai16[5] * puSrc->ai16[5]);
8741 puDst->ai16[6] = RT_HIWORD(uSrc1.ai16[6] * puSrc->ai16[6]);
8742 puDst->ai16[7] = RT_HIWORD(uSrc1.ai16[7] * puSrc->ai16[7]);
8743}
8744
8745#endif
8746
8747
8748/*
8749 * PSRLW / VPSRLW
8750 */
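/* Logical shifts with a count greater than 15 (words) or 31 (doublewords) clear
   the destination, hence the explicit zeroing branches in the helpers below. */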
8751#ifdef IEM_WITHOUT_ASSEMBLY
8752
8753IEM_DECL_IMPL_DEF(void, iemAImpl_psrlw_u64,(uint64_t *puDst, uint64_t const *puSrc))
8754{
8755 RTUINT64U uSrc1 = { *puDst };
8756 RTUINT64U uSrc2 = { *puSrc };
8757 RTUINT64U uDst;
8758
8759 if (uSrc2.au64[0] <= 15)
8760 {
8761 uDst.au16[0] = uSrc1.au16[0] >> uSrc2.au8[0];
8762 uDst.au16[1] = uSrc1.au16[1] >> uSrc2.au8[0];
8763 uDst.au16[2] = uSrc1.au16[2] >> uSrc2.au8[0];
8764 uDst.au16[3] = uSrc1.au16[3] >> uSrc2.au8[0];
8765 }
8766 else
8767 {
8768 uDst.au64[0] = 0;
8769 }
8770 *puDst = uDst.u;
8771}
8772
8773
8774IEM_DECL_IMPL_DEF(void, iemAImpl_psrlw_imm_u64,(uint64_t *puDst, uint8_t uShift))
8775{
8776 RTUINT64U uSrc1 = { *puDst };
8777 RTUINT64U uDst;
8778
8779 if (uShift <= 15)
8780 {
8781 uDst.au16[0] = uSrc1.au16[0] >> uShift;
8782 uDst.au16[1] = uSrc1.au16[1] >> uShift;
8783 uDst.au16[2] = uSrc1.au16[2] >> uShift;
8784 uDst.au16[3] = uSrc1.au16[3] >> uShift;
8785 }
8786 else
8787 {
8788 uDst.au64[0] = 0;
8789 }
8790 *puDst = uDst.u;
8791}
8792
8793
8794IEM_DECL_IMPL_DEF(void, iemAImpl_psrlw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
8795{
8796 RTUINT128U uSrc1 = *puDst;
8797
8798 if (puSrc->au64[0] <= 15)
8799 {
8800 puDst->au16[0] = uSrc1.au16[0] >> puSrc->au8[0];
8801 puDst->au16[1] = uSrc1.au16[1] >> puSrc->au8[0];
8802 puDst->au16[2] = uSrc1.au16[2] >> puSrc->au8[0];
8803 puDst->au16[3] = uSrc1.au16[3] >> puSrc->au8[0];
8804 puDst->au16[4] = uSrc1.au16[4] >> puSrc->au8[0];
8805 puDst->au16[5] = uSrc1.au16[5] >> puSrc->au8[0];
8806 puDst->au16[6] = uSrc1.au16[6] >> puSrc->au8[0];
8807 puDst->au16[7] = uSrc1.au16[7] >> puSrc->au8[0];
8808 }
8809 else
8810 {
8811 puDst->au64[0] = 0;
8812 puDst->au64[1] = 0;
8813 }
8814}
8815
8816IEM_DECL_IMPL_DEF(void, iemAImpl_psrlw_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
8817{
8818 RTUINT128U uSrc1 = *puDst;
8819
8820 if (uShift <= 15)
8821 {
8822 puDst->au16[0] = uSrc1.au16[0] >> uShift;
8823 puDst->au16[1] = uSrc1.au16[1] >> uShift;
8824 puDst->au16[2] = uSrc1.au16[2] >> uShift;
8825 puDst->au16[3] = uSrc1.au16[3] >> uShift;
8826 puDst->au16[4] = uSrc1.au16[4] >> uShift;
8827 puDst->au16[5] = uSrc1.au16[5] >> uShift;
8828 puDst->au16[6] = uSrc1.au16[6] >> uShift;
8829 puDst->au16[7] = uSrc1.au16[7] >> uShift;
8830 }
8831 else
8832 {
8833 puDst->au64[0] = 0;
8834 puDst->au64[1] = 0;
8835 }
8836}
8837
8838#endif
8839
8840
8841/*
8842 * PSRAW / VPSRAW
8843 */
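/* Unlike the logical shifts, an arithmetic right shift with a count greater than
   15 fills each word with its sign bit, which is equivalent to shifting by 15. */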
8844#ifdef IEM_WITHOUT_ASSEMBLY
8845
8846IEM_DECL_IMPL_DEF(void, iemAImpl_psraw_u64,(uint64_t *puDst, uint64_t const *puSrc))
8847{
8848 RTUINT64U uSrc1 = { *puDst };
8849 RTUINT64U uSrc2 = { *puSrc };
8850 RTUINT64U uDst;
8851
8852 if (uSrc2.au64[0] <= 15)
8853 {
8854 uDst.ai16[0] = uSrc1.ai16[0] >> uSrc2.au8[0];
8855 uDst.ai16[1] = uSrc1.ai16[1] >> uSrc2.au8[0];
8856 uDst.ai16[2] = uSrc1.ai16[2] >> uSrc2.au8[0];
8857 uDst.ai16[3] = uSrc1.ai16[3] >> uSrc2.au8[0];
8858 }
8859 else
8860 {
8861 uDst.ai16[0] = uSrc1.ai16[0] >> 15;
     uDst.ai16[1] = uSrc1.ai16[1] >> 15;
     uDst.ai16[2] = uSrc1.ai16[2] >> 15;
     uDst.ai16[3] = uSrc1.ai16[3] >> 15;
8862 }
8863 *puDst = uDst.u;
8864}
8865
8866
8867IEM_DECL_IMPL_DEF(void, iemAImpl_psraw_imm_u64,(uint64_t *puDst, uint8_t uShift))
8868{
8869 RTUINT64U uSrc1 = { *puDst };
8870 RTUINT64U uDst;
8871
8872 if (uShift <= 15)
8873 {
8874 uDst.ai16[0] = uSrc1.ai16[0] >> uShift;
8875 uDst.ai16[1] = uSrc1.ai16[1] >> uShift;
8876 uDst.ai16[2] = uSrc1.ai16[2] >> uShift;
8877 uDst.ai16[3] = uSrc1.ai16[3] >> uShift;
8878 }
8879 else
8880 {
8881 uDst.ai16[0] = uSrc1.ai16[0] >> 15;
     uDst.ai16[1] = uSrc1.ai16[1] >> 15;
     uDst.ai16[2] = uSrc1.ai16[2] >> 15;
     uDst.ai16[3] = uSrc1.ai16[3] >> 15;
8882 }
8883 *puDst = uDst.u;
8884}
8885
8886
8887IEM_DECL_IMPL_DEF(void, iemAImpl_psraw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
8888{
8889 RTUINT128U uSrc1 = *puDst;
8890
8891 if (puSrc->au64[0] <= 15)
8892 {
8893 puDst->ai16[0] = uSrc1.ai16[0] >> puSrc->au8[0];
8894 puDst->ai16[1] = uSrc1.ai16[1] >> puSrc->au8[0];
8895 puDst->ai16[2] = uSrc1.ai16[2] >> puSrc->au8[0];
8896 puDst->ai16[3] = uSrc1.ai16[3] >> puSrc->au8[0];
8897 puDst->ai16[4] = uSrc1.ai16[4] >> puSrc->au8[0];
8898 puDst->ai16[5] = uSrc1.ai16[5] >> puSrc->au8[0];
8899 puDst->ai16[6] = uSrc1.ai16[6] >> puSrc->au8[0];
8900 puDst->ai16[7] = uSrc1.ai16[7] >> puSrc->au8[0];
8901 }
8902 else
8903 {
8904 puDst->ai16[0] = uSrc1.ai16[0] >> 15;
8905 puDst->ai16[1] = uSrc1.ai16[1] >> 15;
     puDst->ai16[2] = uSrc1.ai16[2] >> 15;
     puDst->ai16[3] = uSrc1.ai16[3] >> 15;
     puDst->ai16[4] = uSrc1.ai16[4] >> 15;
     puDst->ai16[5] = uSrc1.ai16[5] >> 15;
     puDst->ai16[6] = uSrc1.ai16[6] >> 15;
     puDst->ai16[7] = uSrc1.ai16[7] >> 15;
8906 }
8907}
8908
8909IEM_DECL_IMPL_DEF(void, iemAImpl_psraw_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
8910{
8911 RTUINT128U uSrc1 = *puDst;
8912
8913 if (uShift <= 15)
8914 {
8915 puDst->ai16[0] = uSrc1.ai16[0] >> uShift;
8916 puDst->ai16[1] = uSrc1.ai16[1] >> uShift;
8917 puDst->ai16[2] = uSrc1.ai16[2] >> uShift;
8918 puDst->ai16[3] = uSrc1.ai16[3] >> uShift;
8919 puDst->ai16[4] = uSrc1.ai16[4] >> uShift;
8920 puDst->ai16[5] = uSrc1.ai16[5] >> uShift;
8921 puDst->ai16[6] = uSrc1.ai16[6] >> uShift;
8922 puDst->ai16[7] = uSrc1.ai16[7] >> uShift;
8923 }
8924 else
8925 {
8926 puDst->ai16[0] = uSrc1.ai16[0] >> 15;
8927 puDst->ai16[1] = uSrc1.ai16[1] >> 15;
     puDst->ai16[2] = uSrc1.ai16[2] >> 15;
     puDst->ai16[3] = uSrc1.ai16[3] >> 15;
     puDst->ai16[4] = uSrc1.ai16[4] >> 15;
     puDst->ai16[5] = uSrc1.ai16[5] >> 15;
     puDst->ai16[6] = uSrc1.ai16[6] >> 15;
     puDst->ai16[7] = uSrc1.ai16[7] >> 15;
8928 }
8929}
8930
8931#endif
8932
8933
8934/*
8935 * PSLLW / VPSLLW
8936 */
8937#ifdef IEM_WITHOUT_ASSEMBLY
8938
8939IEM_DECL_IMPL_DEF(void, iemAImpl_psllw_u64,(uint64_t *puDst, uint64_t const *puSrc))
8940{
8941 RTUINT64U uSrc1 = { *puDst };
8942 RTUINT64U uSrc2 = { *puSrc };
8943 RTUINT64U uDst;
8944
8945 if (uSrc2.au64[0] <= 15)
8946 {
8947 uDst.au16[0] = uSrc1.au16[0] << uSrc2.au8[0];
8948 uDst.au16[1] = uSrc1.au16[1] << uSrc2.au8[0];
8949 uDst.au16[2] = uSrc1.au16[2] << uSrc2.au8[0];
8950 uDst.au16[3] = uSrc1.au16[3] << uSrc2.au8[0];
8951 }
8952 else
8953 {
8954 uDst.au64[0] = 0;
8955 }
8956 *puDst = uDst.u;
8957}
8958
8959
8960IEM_DECL_IMPL_DEF(void, iemAImpl_psllw_imm_u64,(uint64_t *puDst, uint8_t uShift))
8961{
8962 RTUINT64U uSrc1 = { *puDst };
8963 RTUINT64U uDst;
8964
8965 if (uShift <= 15)
8966 {
8967 uDst.au16[0] = uSrc1.au16[0] << uShift;
8968 uDst.au16[1] = uSrc1.au16[1] << uShift;
8969 uDst.au16[2] = uSrc1.au16[2] << uShift;
8970 uDst.au16[3] = uSrc1.au16[3] << uShift;
8971 }
8972 else
8973 {
8974 uDst.au64[0] = 0;
8975 }
8976 *puDst = uDst.u;
8977}
8978
8979
8980IEM_DECL_IMPL_DEF(void, iemAImpl_psllw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
8981{
8982 RTUINT128U uSrc1 = *puDst;
8983
8984 if (puSrc->au64[0] <= 15)
8985 {
8986 puDst->au16[0] = uSrc1.au16[0] << puSrc->au8[0];
8987 puDst->au16[1] = uSrc1.au16[1] << puSrc->au8[0];
8988 puDst->au16[2] = uSrc1.au16[2] << puSrc->au8[0];
8989 puDst->au16[3] = uSrc1.au16[3] << puSrc->au8[0];
8990 puDst->au16[4] = uSrc1.au16[4] << puSrc->au8[0];
8991 puDst->au16[5] = uSrc1.au16[5] << puSrc->au8[0];
8992 puDst->au16[6] = uSrc1.au16[6] << puSrc->au8[0];
8993 puDst->au16[7] = uSrc1.au16[7] << puSrc->au8[0];
8994 }
8995 else
8996 {
8997 puDst->au64[0] = 0;
8998 puDst->au64[1] = 0;
8999 }
9000}
9001
9002IEM_DECL_IMPL_DEF(void, iemAImpl_psllw_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
9003{
9004 RTUINT128U uSrc1 = *puDst;
9005
9006 if (uShift <= 15)
9007 {
9008 puDst->au16[0] = uSrc1.au16[0] << uShift;
9009 puDst->au16[1] = uSrc1.au16[1] << uShift;
9010 puDst->au16[2] = uSrc1.au16[2] << uShift;
9011 puDst->au16[3] = uSrc1.au16[3] << uShift;
9012 puDst->au16[4] = uSrc1.au16[4] << uShift;
9013 puDst->au16[5] = uSrc1.au16[5] << uShift;
9014 puDst->au16[6] = uSrc1.au16[6] << uShift;
9015 puDst->au16[7] = uSrc1.au16[7] << uShift;
9016 }
9017 else
9018 {
9019 puDst->au64[0] = 0;
9020 puDst->au64[1] = 0;
9021 }
9022}
9023
9024#endif
9025
9026
9027/*
9028 * PSRLD / VPSRLD
9029 */
9030#ifdef IEM_WITHOUT_ASSEMBLY
9031
9032IEM_DECL_IMPL_DEF(void, iemAImpl_psrld_u64,(uint64_t *puDst, uint64_t const *puSrc))
9033{
9034 RTUINT64U uSrc1 = { *puDst };
9035 RTUINT64U uSrc2 = { *puSrc };
9036 RTUINT64U uDst;
9037
9038 if (uSrc2.au64[0] <= 31)
9039 {
9040 uDst.au32[0] = uSrc1.au32[0] >> uSrc2.au8[0];
9041 uDst.au32[1] = uSrc1.au32[1] >> uSrc2.au8[0];
9042 }
9043 else
9044 {
9045 uDst.au64[0] = 0;
9046 }
9047 *puDst = uDst.u;
9048}
9049
9050
9051IEM_DECL_IMPL_DEF(void, iemAImpl_psrld_imm_u64,(uint64_t *puDst, uint8_t uShift))
9052{
9053 RTUINT64U uSrc1 = { *puDst };
9054 RTUINT64U uDst;
9055
9056 if (uShift <= 31)
9057 {
9058 uDst.au32[0] = uSrc1.au32[0] >> uShift;
9059 uDst.au32[1] = uSrc1.au32[1] >> uShift;
9060 }
9061 else
9062 {
9063 uDst.au64[0] = 0;
9064 }
9065 *puDst = uDst.u;
9066}
9067
9068
9069IEM_DECL_IMPL_DEF(void, iemAImpl_psrld_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9070{
9071 RTUINT128U uSrc1 = *puDst;
9072
9073 if (puSrc->au64[0] <= 31)
9074 {
9075 puDst->au32[0] = uSrc1.au32[0] >> puSrc->au8[0];
9076 puDst->au32[1] = uSrc1.au32[1] >> puSrc->au8[0];
9077 puDst->au32[2] = uSrc1.au32[2] >> puSrc->au8[0];
9078 puDst->au32[3] = uSrc1.au32[3] >> puSrc->au8[0];
9079 }
9080 else
9081 {
9082 puDst->au64[0] = 0;
9083 puDst->au64[1] = 0;
9084 }
9085}
9086
9087IEM_DECL_IMPL_DEF(void, iemAImpl_psrld_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
9088{
9089 RTUINT128U uSrc1 = *puDst;
9090
9091 if (uShift <= 31)
9092 {
9093 puDst->au32[0] = uSrc1.au32[0] >> uShift;
9094 puDst->au32[1] = uSrc1.au32[1] >> uShift;
9095 puDst->au32[2] = uSrc1.au32[2] >> uShift;
9096 puDst->au32[3] = uSrc1.au32[3] >> uShift;
9097 }
9098 else
9099 {
9100 puDst->au64[0] = 0;
9101 puDst->au64[1] = 0;
9102 }
9103}
9104
9105#endif
9106
9107
9108/*
9109 * PSRAD / VPSRAD
9110 */
9111#ifdef IEM_WITHOUT_ASSEMBLY
9112
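/*
 * Note: Like PSRAW, PSRAD shifts arithmetically; a count above 31 leaves every
 * doubleword holding nothing but copies of its original sign bit.
 */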
9113IEM_DECL_IMPL_DEF(void, iemAImpl_psrad_u64,(uint64_t *puDst, uint64_t const *puSrc))
9114{
9115 RTUINT64U uSrc1 = { *puDst };
9116 RTUINT64U uSrc2 = { *puSrc };
9117 RTUINT64U uDst;
9118
9119 if (uSrc2.au64[0] <= 31)
9120 {
9121 uDst.ai32[0] = uSrc1.ai32[0] >> uSrc2.au8[0];
9122 uDst.ai32[1] = uSrc1.ai32[1] >> uSrc2.au8[0];
9123 }
9124 else
9125 {
9126 uDst.ai32[0] = uSrc1.ai32[0] >> 31; /* a count above 31 fills each doubleword with its sign bit */
uDst.ai32[1] = uSrc1.ai32[1] >> 31;
9127 }
9128 *puDst = uDst.u;
9129}
9130
9131
9132IEM_DECL_IMPL_DEF(void, iemAImpl_psrad_imm_u64,(uint64_t *puDst, uint8_t uShift))
9133{
9134 RTUINT64U uSrc1 = { *puDst };
9135 RTUINT64U uDst;
9136
9137 if (uShift <= 31)
9138 {
9139 uDst.ai32[0] = uSrc1.ai32[0] >> uShift;
9140 uDst.ai32[1] = uSrc1.ai32[1] >> uShift;
9141 }
9142 else
9143 {
9144 uDst.ai32[0] = uSrc1.ai32[0] >> 31; /* a count above 31 fills each doubleword with its sign bit */
uDst.ai32[1] = uSrc1.ai32[1] >> 31;
9145 }
9146 *puDst = uDst.u;
9147}
9148
9149
9150IEM_DECL_IMPL_DEF(void, iemAImpl_psrad_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9151{
9152 RTUINT128U uSrc1 = *puDst;
9153
9154 if (puSrc->au64[0] <= 31)
9155 {
9156 puDst->ai32[0] = uSrc1.ai32[0] >> puSrc->au8[0];
9157 puDst->ai32[1] = uSrc1.ai32[1] >> puSrc->au8[0];
9158 puDst->ai32[2] = uSrc1.ai32[2] >> puSrc->au8[0];
9159 puDst->ai32[3] = uSrc1.ai32[3] >> puSrc->au8[0];
9160 }
9161 else
9162 {
9163 puDst->ai32[0] = uSrc1.ai32[0] >> 31; /* a count above 31 fills each doubleword with its sign bit */
9164 puDst->ai32[1] = uSrc1.ai32[1] >> 31;
puDst->ai32[2] = uSrc1.ai32[2] >> 31;
puDst->ai32[3] = uSrc1.ai32[3] >> 31;
9165 }
9166}
9167
9168IEM_DECL_IMPL_DEF(void, iemAImpl_psrad_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
9169{
9170 RTUINT128U uSrc1 = *puDst;
9171
9172 if (uShift <= 31)
9173 {
9174 puDst->ai32[0] = uSrc1.ai32[0] >> uShift;
9175 puDst->ai32[1] = uSrc1.ai32[1] >> uShift;
9176 puDst->ai32[2] = uSrc1.ai32[2] >> uShift;
9177 puDst->ai32[3] = uSrc1.ai32[3] >> uShift;
9178 }
9179 else
9180 {
9181 puDst->ai32[0] = uSrc1.ai32[0] >> 31; /* a count above 31 fills each doubleword with its sign bit */
9182 puDst->ai32[1] = uSrc1.ai32[1] >> 31;
puDst->ai32[2] = uSrc1.ai32[2] >> 31;
puDst->ai32[3] = uSrc1.ai32[3] >> 31;
9183 }
9184}
9185
9186#endif
9187
9188
9189/*
9190 * PSLLD / VPSLLD
9191 */
9192#ifdef IEM_WITHOUT_ASSEMBLY
9193
9194IEM_DECL_IMPL_DEF(void, iemAImpl_pslld_u64,(uint64_t *puDst, uint64_t const *puSrc))
9195{
9196 RTUINT64U uSrc1 = { *puDst };
9197 RTUINT64U uSrc2 = { *puSrc };
9198 RTUINT64U uDst;
9199
9200 if (uSrc2.au64[0] <= 31)
9201 {
9202 uDst.au32[0] = uSrc1.au32[0] << uSrc2.au8[0];
9203 uDst.au32[1] = uSrc1.au32[1] << uSrc2.au8[0];
9204 }
9205 else
9206 {
9207 uDst.au64[0] = 0;
9208 }
9209 *puDst = uDst.u;
9210}
9211
9212
9213IEM_DECL_IMPL_DEF(void, iemAImpl_pslld_imm_u64,(uint64_t *puDst, uint8_t uShift))
9214{
9215 RTUINT64U uSrc1 = { *puDst };
9216 RTUINT64U uDst;
9217
9218 if (uShift <= 31)
9219 {
9220 uDst.au32[0] = uSrc1.au32[0] << uShift;
9221 uDst.au32[1] = uSrc1.au32[1] << uShift;
9222 }
9223 else
9224 {
9225 uDst.au64[0] = 0;
9226 }
9227 *puDst = uDst.u;
9228}
9229
9230
9231IEM_DECL_IMPL_DEF(void, iemAImpl_pslld_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9232{
9233 RTUINT128U uSrc1 = *puDst;
9234
9235 if (puSrc->au64[0] <= 31)
9236 {
9237 puDst->au32[0] = uSrc1.au32[0] << puSrc->au8[0];
9238 puDst->au32[1] = uSrc1.au32[1] << puSrc->au8[0];
9239 puDst->au32[2] = uSrc1.au32[2] << puSrc->au8[0];
9240 puDst->au32[3] = uSrc1.au32[3] << puSrc->au8[0];
9241 }
9242 else
9243 {
9244 puDst->au64[0] = 0;
9245 puDst->au64[1] = 0;
9246 }
9247}
9248
9249IEM_DECL_IMPL_DEF(void, iemAImpl_pslld_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
9250{
9251 RTUINT128U uSrc1 = *puDst;
9252
9253 if (uShift <= 31)
9254 {
9255 puDst->au32[0] = uSrc1.au32[0] << uShift;
9256 puDst->au32[1] = uSrc1.au32[1] << uShift;
9257 puDst->au32[2] = uSrc1.au32[2] << uShift;
9258 puDst->au32[3] = uSrc1.au32[3] << uShift;
9259 }
9260 else
9261 {
9262 puDst->au64[0] = 0;
9263 puDst->au64[1] = 0;
9264 }
9265}
9266
9267#endif
9268
9269
9270/*
9271 * PSRLQ / VPSRLQ
9272 */
9273#ifdef IEM_WITHOUT_ASSEMBLY
9274
9275IEM_DECL_IMPL_DEF(void, iemAImpl_psrlq_u64,(uint64_t *puDst, uint64_t const *puSrc))
9276{
9277 RTUINT64U uSrc1 = { *puDst };
9278 RTUINT64U uSrc2 = { *puSrc };
9279 RTUINT64U uDst;
9280
9281 if (uSrc2.au64[0] <= 63)
9282 {
9283 uDst.au64[0] = uSrc1.au64[0] >> uSrc2.au8[0];
9284 }
9285 else
9286 {
9287 uDst.au64[0] = 0;
9288 }
9289 *puDst = uDst.u;
9290}
9291
9292
9293IEM_DECL_IMPL_DEF(void, iemAImpl_psrlq_imm_u64,(uint64_t *puDst, uint8_t uShift))
9294{
9295 RTUINT64U uSrc1 = { *puDst };
9296 RTUINT64U uDst;
9297
9298 if (uShift <= 63)
9299 {
9300 uDst.au64[0] = uSrc1.au64[0] >> uShift;
9301 }
9302 else
9303 {
9304 uDst.au64[0] = 0;
9305 }
9306 *puDst = uDst.u;
9307}
9308
9309
9310IEM_DECL_IMPL_DEF(void, iemAImpl_psrlq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9311{
9312 RTUINT128U uSrc1 = *puDst;
9313
9314 if (puSrc->au64[0] <= 63)
9315 {
9316 puDst->au64[0] = uSrc1.au64[0] >> puSrc->au8[0];
9317 puDst->au64[1] = uSrc1.au64[1] >> puSrc->au8[0];
9318 }
9319 else
9320 {
9321 puDst->au64[0] = 0;
9322 puDst->au64[1] = 0;
9323 }
9324}
9325
9326IEM_DECL_IMPL_DEF(void, iemAImpl_psrlq_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
9327{
9328 RTUINT128U uSrc1 = *puDst;
9329
9330 if (uShift <= 63)
9331 {
9332 puDst->au64[0] = uSrc1.au64[0] >> uShift;
9333 puDst->au64[1] = uSrc1.au64[1] >> uShift;
9334 }
9335 else
9336 {
9337 puDst->au64[0] = 0;
9338 puDst->au64[1] = 0;
9339 }
9340}
9341
9342#endif
9343
9344
9345/*
9346 * PSLLQ / VPSLLQ
9347 */
9348#ifdef IEM_WITHOUT_ASSEMBLY
9349
9350IEM_DECL_IMPL_DEF(void, iemAImpl_psllq_u64,(uint64_t *puDst, uint64_t const *puSrc))
9351{
9352 RTUINT64U uSrc1 = { *puDst };
9353 RTUINT64U uSrc2 = { *puSrc };
9354 RTUINT64U uDst;
9355
9356 if (uSrc2.au64[0] <= 63)
9357 {
9358 uDst.au64[0] = uSrc1.au64[0] << uSrc2.au8[0];
9359 }
9360 else
9361 {
9362 uDst.au64[0] = 0;
9363 }
9364 *puDst = uDst.u;
9365}
9366
9367
9368IEM_DECL_IMPL_DEF(void, iemAImpl_psllq_imm_u64,(uint64_t *puDst, uint8_t uShift))
9369{
9370 RTUINT64U uSrc1 = { *puDst };
9371 RTUINT64U uDst;
9372
9373 if (uShift <= 63)
9374 {
9375 uDst.au64[0] = uSrc1.au64[0] << uShift;
9376 }
9377 else
9378 {
9379 uDst.au64[0] = 0;
9380 }
9381 *puDst = uDst.u;
9382}
9383
9384
9385IEM_DECL_IMPL_DEF(void, iemAImpl_psllq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9386{
9387 RTUINT128U uSrc1 = *puDst;
9388
9389 if (puSrc->au64[0] <= 63)
9390 {
9391 puDst->au64[0] = uSrc1.au64[0] << puSrc->au8[0];
9392 puDst->au64[1] = uSrc1.au64[1] << puSrc->au8[0];
9393 }
9394 else
9395 {
9396 puDst->au64[0] = 0;
9397 puDst->au64[1] = 0;
9398 }
9399}
9400
9401IEM_DECL_IMPL_DEF(void, iemAImpl_psllq_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
9402{
9403 RTUINT128U uSrc1 = *puDst;
9404
9405 if (uShift <= 63)
9406 {
9407 puDst->au64[0] = uSrc1.au64[0] << uShift;
9408 puDst->au64[1] = uSrc1.au64[1] << uShift;
9409 }
9410 else
9411 {
9412 puDst->au64[0] = 0;
9413 puDst->au64[1] = 0;
9414 }
9415}
9416
9417#endif
9418
9419
9420/*
9421 * PSRLDQ / VPSRLDQ
9422 */
9423#ifdef IEM_WITHOUT_ASSEMBLY
9424
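/* Note: PSRLDQ shifts the whole 128-bit value right by uShift bytes (not bits); counts of 16 or more clear the register. */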
9425IEM_DECL_IMPL_DEF(void, iemAImpl_psrldq_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
9426{
9427 RTUINT128U uSrc1 = *puDst;
9428
9429 if (uShift < 16)
9430 {
9431 int i;
9432
9433 for (i = 0; i < 16 - uShift; ++i)
9434 puDst->au8[i] = uSrc1.au8[i + uShift];
9435 for (i = 16 - uShift; i < 16; ++i)
9436 puDst->au8[i] = 0;
9437 }
9438 else
9439 {
9440 puDst->au64[0] = 0;
9441 puDst->au64[1] = 0;
9442 }
9443}
9444
9445#endif
9446
9447
9448/*
9449 * PSLLDQ / VPSLLDQ
9450 */
9451#ifdef IEM_WITHOUT_ASSEMBLY
9452
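/* Note: PSLLDQ shifts the whole 128-bit value left by uShift bytes (not bits), zero filling from the low end; counts of 16 or more clear the register. */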
9453IEM_DECL_IMPL_DEF(void, iemAImpl_pslldq_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
9454{
9455 RTUINT128U uSrc1 = *puDst;
9456
9457 if (uShift < 16)
9458 {
9459 int i;
9460
9461 for (i = 0; i < uShift; ++i)
9462 puDst->au8[i] = 0;
9463 for (i = uShift; i < 16; ++i)
9464 puDst->au8[i] = uSrc1.au8[i - uShift];
9465 }
9466 else
9467 {
9468 puDst->au64[0] = 0;
9469 puDst->au64[1] = 0;
9470 }
9471}
9472
9473#endif
9474
9475
9476/*
9477 * PMADDWD / VPMADDWD
9478 */
9479#ifdef IEM_WITHOUT_ASSEMBLY
9480
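/*
 * Note: PMADDWD multiplies the corresponding signed words of the two operands and adds
 * each adjacent pair of 32-bit products, yielding packed signed doublewords.
 */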
9481IEM_DECL_IMPL_DEF(void, iemAImpl_pmaddwd_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9482{
9483 RTUINT64U uSrc1 = { *puDst };
9484 RTUINT64U uSrc2 = { *puSrc };
9485 RTUINT64U uDst;
9486
9487 uDst.ai32[0] = (int32_t)uSrc1.ai16[0] * uSrc2.ai16[0] + (int32_t)uSrc1.ai16[1] * uSrc2.ai16[1];
9488 uDst.ai32[1] = (int32_t)uSrc1.ai16[2] * uSrc2.ai16[2] + (int32_t)uSrc1.ai16[3] * uSrc2.ai16[3];
9489 *puDst = uDst.u;
9490 RT_NOREF(pFpuState);
9491}
9492
9493
9494IEM_DECL_IMPL_DEF(void, iemAImpl_pmaddwd_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9495{
9496 RTUINT128U uSrc1 = *puDst;
9497
9498 puDst->ai32[0] = (int32_t)uSrc1.ai16[0] * puSrc->ai16[0] + (int32_t)uSrc1.ai16[1] * puSrc->ai16[1];
9499 puDst->ai32[1] = (int32_t)uSrc1.ai16[2] * puSrc->ai16[2] + (int32_t)uSrc1.ai16[3] * puSrc->ai16[3];
9500 puDst->ai32[2] = (int32_t)uSrc1.ai16[4] * puSrc->ai16[4] + (int32_t)uSrc1.ai16[5] * puSrc->ai16[5];
9501 puDst->ai32[3] = (int32_t)uSrc1.ai16[6] * puSrc->ai16[6] + (int32_t)uSrc1.ai16[7] * puSrc->ai16[7];
9502 RT_NOREF(pFpuState);
9503}
9504
9505#endif
9506
9507
9508/*
9509 * PMAXUB / VPMAXUB / PMAXUW / VPMAXUW / PMAXUD / VPMAXUD
9510 */
9511#ifdef IEM_WITHOUT_ASSEMBLY
9512
9513IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxub_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9514{
9515 RTUINT64U uSrc1 = { *puDst };
9516 RTUINT64U uSrc2 = { *puSrc };
9517 RTUINT64U uDst;
9518
9519 uDst.au8[0] = RT_MAX(uSrc1.au8[0], uSrc2.au8[0]);
9520 uDst.au8[1] = RT_MAX(uSrc1.au8[1], uSrc2.au8[1]);
9521 uDst.au8[2] = RT_MAX(uSrc1.au8[2], uSrc2.au8[2]);
9522 uDst.au8[3] = RT_MAX(uSrc1.au8[3], uSrc2.au8[3]);
9523 uDst.au8[4] = RT_MAX(uSrc1.au8[4], uSrc2.au8[4]);
9524 uDst.au8[5] = RT_MAX(uSrc1.au8[5], uSrc2.au8[5]);
9525 uDst.au8[6] = RT_MAX(uSrc1.au8[6], uSrc2.au8[6]);
9526 uDst.au8[7] = RT_MAX(uSrc1.au8[7], uSrc2.au8[7]);
9527 *puDst = uDst.u;
9528 RT_NOREF(pFpuState);
9529}
9530
9531
9532IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxub_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9533{
9534 RTUINT128U uSrc1 = *puDst;
9535
9536 puDst->au8[ 0] = RT_MAX(uSrc1.au8[ 0], puSrc->au8[ 0]);
9537 puDst->au8[ 1] = RT_MAX(uSrc1.au8[ 1], puSrc->au8[ 1]);
9538 puDst->au8[ 2] = RT_MAX(uSrc1.au8[ 2], puSrc->au8[ 2]);
9539 puDst->au8[ 3] = RT_MAX(uSrc1.au8[ 3], puSrc->au8[ 3]);
9540 puDst->au8[ 4] = RT_MAX(uSrc1.au8[ 4], puSrc->au8[ 4]);
9541 puDst->au8[ 5] = RT_MAX(uSrc1.au8[ 5], puSrc->au8[ 5]);
9542 puDst->au8[ 6] = RT_MAX(uSrc1.au8[ 6], puSrc->au8[ 6]);
9543 puDst->au8[ 7] = RT_MAX(uSrc1.au8[ 7], puSrc->au8[ 7]);
9544 puDst->au8[ 8] = RT_MAX(uSrc1.au8[ 8], puSrc->au8[ 8]);
9545 puDst->au8[ 9] = RT_MAX(uSrc1.au8[ 9], puSrc->au8[ 9]);
9546 puDst->au8[10] = RT_MAX(uSrc1.au8[10], puSrc->au8[10]);
9547 puDst->au8[11] = RT_MAX(uSrc1.au8[11], puSrc->au8[11]);
9548 puDst->au8[12] = RT_MAX(uSrc1.au8[12], puSrc->au8[12]);
9549 puDst->au8[13] = RT_MAX(uSrc1.au8[13], puSrc->au8[13]);
9550 puDst->au8[14] = RT_MAX(uSrc1.au8[14], puSrc->au8[14]);
9551 puDst->au8[15] = RT_MAX(uSrc1.au8[15], puSrc->au8[15]);
9552 RT_NOREF(pFpuState);
9553}
9554
9555#endif
9556
9557
9558IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxuw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9559{
9560 RTUINT128U uSrc1 = *puDst;
9561
9562 puDst->au16[ 0] = RT_MAX(uSrc1.au16[ 0], puSrc->au16[ 0]);
9563 puDst->au16[ 1] = RT_MAX(uSrc1.au16[ 1], puSrc->au16[ 1]);
9564 puDst->au16[ 2] = RT_MAX(uSrc1.au16[ 2], puSrc->au16[ 2]);
9565 puDst->au16[ 3] = RT_MAX(uSrc1.au16[ 3], puSrc->au16[ 3]);
9566 puDst->au16[ 4] = RT_MAX(uSrc1.au16[ 4], puSrc->au16[ 4]);
9567 puDst->au16[ 5] = RT_MAX(uSrc1.au16[ 5], puSrc->au16[ 5]);
9568 puDst->au16[ 6] = RT_MAX(uSrc1.au16[ 6], puSrc->au16[ 6]);
9569 puDst->au16[ 7] = RT_MAX(uSrc1.au16[ 7], puSrc->au16[ 7]);
9570 RT_NOREF(pFpuState);
9571}
9572
9573
9574IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxud_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9575{
9576 RTUINT128U uSrc1 = *puDst;
9577
9578 puDst->au32[ 0] = RT_MAX(uSrc1.au32[ 0], puSrc->au32[ 0]);
9579 puDst->au32[ 1] = RT_MAX(uSrc1.au32[ 1], puSrc->au32[ 1]);
9580 puDst->au32[ 2] = RT_MAX(uSrc1.au32[ 2], puSrc->au32[ 2]);
9581 puDst->au32[ 3] = RT_MAX(uSrc1.au32[ 3], puSrc->au32[ 3]);
9582 RT_NOREF(pFpuState);
9583}
9584
9585
9586IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxub_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
9587 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9588{
9589 puDst->au8[ 0] = RT_MAX(puSrc1->au8[ 0], puSrc2->au8[ 0]);
9590 puDst->au8[ 1] = RT_MAX(puSrc1->au8[ 1], puSrc2->au8[ 1]);
9591 puDst->au8[ 2] = RT_MAX(puSrc1->au8[ 2], puSrc2->au8[ 2]);
9592 puDst->au8[ 3] = RT_MAX(puSrc1->au8[ 3], puSrc2->au8[ 3]);
9593 puDst->au8[ 4] = RT_MAX(puSrc1->au8[ 4], puSrc2->au8[ 4]);
9594 puDst->au8[ 5] = RT_MAX(puSrc1->au8[ 5], puSrc2->au8[ 5]);
9595 puDst->au8[ 6] = RT_MAX(puSrc1->au8[ 6], puSrc2->au8[ 6]);
9596 puDst->au8[ 7] = RT_MAX(puSrc1->au8[ 7], puSrc2->au8[ 7]);
9597 puDst->au8[ 8] = RT_MAX(puSrc1->au8[ 8], puSrc2->au8[ 8]);
9598 puDst->au8[ 9] = RT_MAX(puSrc1->au8[ 9], puSrc2->au8[ 9]);
9599 puDst->au8[10] = RT_MAX(puSrc1->au8[10], puSrc2->au8[10]);
9600 puDst->au8[11] = RT_MAX(puSrc1->au8[11], puSrc2->au8[11]);
9601 puDst->au8[12] = RT_MAX(puSrc1->au8[12], puSrc2->au8[12]);
9602 puDst->au8[13] = RT_MAX(puSrc1->au8[13], puSrc2->au8[13]);
9603 puDst->au8[14] = RT_MAX(puSrc1->au8[14], puSrc2->au8[14]);
9604 puDst->au8[15] = RT_MAX(puSrc1->au8[15], puSrc2->au8[15]);
9605 RT_NOREF(pExtState);
9606}
9607
9608
9609IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxub_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
9610 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9611{
9612 puDst->au8[ 0] = RT_MAX(puSrc1->au8[ 0], puSrc2->au8[ 0]);
9613 puDst->au8[ 1] = RT_MAX(puSrc1->au8[ 1], puSrc2->au8[ 1]);
9614 puDst->au8[ 2] = RT_MAX(puSrc1->au8[ 2], puSrc2->au8[ 2]);
9615 puDst->au8[ 3] = RT_MAX(puSrc1->au8[ 3], puSrc2->au8[ 3]);
9616 puDst->au8[ 4] = RT_MAX(puSrc1->au8[ 4], puSrc2->au8[ 4]);
9617 puDst->au8[ 5] = RT_MAX(puSrc1->au8[ 5], puSrc2->au8[ 5]);
9618 puDst->au8[ 6] = RT_MAX(puSrc1->au8[ 6], puSrc2->au8[ 6]);
9619 puDst->au8[ 7] = RT_MAX(puSrc1->au8[ 7], puSrc2->au8[ 7]);
9620 puDst->au8[ 8] = RT_MAX(puSrc1->au8[ 8], puSrc2->au8[ 8]);
9621 puDst->au8[ 9] = RT_MAX(puSrc1->au8[ 9], puSrc2->au8[ 9]);
9622 puDst->au8[10] = RT_MAX(puSrc1->au8[10], puSrc2->au8[10]);
9623 puDst->au8[11] = RT_MAX(puSrc1->au8[11], puSrc2->au8[11]);
9624 puDst->au8[12] = RT_MAX(puSrc1->au8[12], puSrc2->au8[12]);
9625 puDst->au8[13] = RT_MAX(puSrc1->au8[13], puSrc2->au8[13]);
9626 puDst->au8[14] = RT_MAX(puSrc1->au8[14], puSrc2->au8[14]);
9627 puDst->au8[15] = RT_MAX(puSrc1->au8[15], puSrc2->au8[15]);
9628 puDst->au8[16] = RT_MAX(puSrc1->au8[16], puSrc2->au8[16]);
9629 puDst->au8[17] = RT_MAX(puSrc1->au8[17], puSrc2->au8[17]);
9630 puDst->au8[18] = RT_MAX(puSrc1->au8[18], puSrc2->au8[18]);
9631 puDst->au8[19] = RT_MAX(puSrc1->au8[19], puSrc2->au8[19]);
9632 puDst->au8[20] = RT_MAX(puSrc1->au8[20], puSrc2->au8[20]);
9633 puDst->au8[21] = RT_MAX(puSrc1->au8[21], puSrc2->au8[21]);
9634 puDst->au8[22] = RT_MAX(puSrc1->au8[22], puSrc2->au8[22]);
9635 puDst->au8[23] = RT_MAX(puSrc1->au8[23], puSrc2->au8[23]);
9636 puDst->au8[24] = RT_MAX(puSrc1->au8[24], puSrc2->au8[24]);
9637 puDst->au8[25] = RT_MAX(puSrc1->au8[25], puSrc2->au8[25]);
9638 puDst->au8[26] = RT_MAX(puSrc1->au8[26], puSrc2->au8[26]);
9639 puDst->au8[27] = RT_MAX(puSrc1->au8[27], puSrc2->au8[27]);
9640 puDst->au8[28] = RT_MAX(puSrc1->au8[28], puSrc2->au8[28]);
9641 puDst->au8[29] = RT_MAX(puSrc1->au8[29], puSrc2->au8[29]);
9642 puDst->au8[30] = RT_MAX(puSrc1->au8[30], puSrc2->au8[30]);
9643 puDst->au8[31] = RT_MAX(puSrc1->au8[31], puSrc2->au8[31]);
9644 RT_NOREF(pExtState);
9645}
9646
9647
9648IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxuw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
9649 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9650{
9651 puDst->au16[ 0] = RT_MAX(puSrc1->au16[ 0], puSrc2->au16[ 0]);
9652 puDst->au16[ 1] = RT_MAX(puSrc1->au16[ 1], puSrc2->au16[ 1]);
9653 puDst->au16[ 2] = RT_MAX(puSrc1->au16[ 2], puSrc2->au16[ 2]);
9654 puDst->au16[ 3] = RT_MAX(puSrc1->au16[ 3], puSrc2->au16[ 3]);
9655 puDst->au16[ 4] = RT_MAX(puSrc1->au16[ 4], puSrc2->au16[ 4]);
9656 puDst->au16[ 5] = RT_MAX(puSrc1->au16[ 5], puSrc2->au16[ 5]);
9657 puDst->au16[ 6] = RT_MAX(puSrc1->au16[ 6], puSrc2->au16[ 6]);
9658 puDst->au16[ 7] = RT_MAX(puSrc1->au16[ 7], puSrc2->au16[ 7]);
9659 RT_NOREF(pExtState);
9660}
9661
9662
9663IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxuw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
9664 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9665{
9666 puDst->au16[ 0] = RT_MAX(puSrc1->au16[ 0], puSrc2->au16[ 0]);
9667 puDst->au16[ 1] = RT_MAX(puSrc1->au16[ 1], puSrc2->au16[ 1]);
9668 puDst->au16[ 2] = RT_MAX(puSrc1->au16[ 2], puSrc2->au16[ 2]);
9669 puDst->au16[ 3] = RT_MAX(puSrc1->au16[ 3], puSrc2->au16[ 3]);
9670 puDst->au16[ 4] = RT_MAX(puSrc1->au16[ 4], puSrc2->au16[ 4]);
9671 puDst->au16[ 5] = RT_MAX(puSrc1->au16[ 5], puSrc2->au16[ 5]);
9672 puDst->au16[ 6] = RT_MAX(puSrc1->au16[ 6], puSrc2->au16[ 6]);
9673 puDst->au16[ 7] = RT_MAX(puSrc1->au16[ 7], puSrc2->au16[ 7]);
9674 puDst->au16[ 8] = RT_MAX(puSrc1->au16[ 8], puSrc2->au16[ 8]);
9675 puDst->au16[ 9] = RT_MAX(puSrc1->au16[ 9], puSrc2->au16[ 9]);
9676 puDst->au16[10] = RT_MAX(puSrc1->au16[10], puSrc2->au16[10]);
9677 puDst->au16[11] = RT_MAX(puSrc1->au16[11], puSrc2->au16[11]);
9678 puDst->au16[12] = RT_MAX(puSrc1->au16[12], puSrc2->au16[12]);
9679 puDst->au16[13] = RT_MAX(puSrc1->au16[13], puSrc2->au16[13]);
9680 puDst->au16[14] = RT_MAX(puSrc1->au16[14], puSrc2->au16[14]);
9681 puDst->au16[15] = RT_MAX(puSrc1->au16[15], puSrc2->au16[15]);
9682 RT_NOREF(pExtState);
9683}
9684
9685
9686IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxud_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
9687 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9688{
9689 puDst->au32[ 0] = RT_MAX(puSrc1->au32[ 0], puSrc2->au32[ 0]);
9690 puDst->au32[ 1] = RT_MAX(puSrc1->au32[ 1], puSrc2->au32[ 1]);
9691 puDst->au32[ 2] = RT_MAX(puSrc1->au32[ 2], puSrc2->au32[ 2]);
9692 puDst->au32[ 3] = RT_MAX(puSrc1->au32[ 3], puSrc2->au32[ 3]);
9693 RT_NOREF(pExtState);
9694}
9695
9696
9697IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxud_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
9698 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9699{
9700 puDst->au32[ 0] = RT_MAX(puSrc1->au32[ 0], puSrc2->au32[ 0]);
9701 puDst->au32[ 1] = RT_MAX(puSrc1->au32[ 1], puSrc2->au32[ 1]);
9702 puDst->au32[ 2] = RT_MAX(puSrc1->au32[ 2], puSrc2->au32[ 2]);
9703 puDst->au32[ 3] = RT_MAX(puSrc1->au32[ 3], puSrc2->au32[ 3]);
9704 puDst->au32[ 4] = RT_MAX(puSrc1->au32[ 4], puSrc2->au32[ 4]);
9705 puDst->au32[ 5] = RT_MAX(puSrc1->au32[ 5], puSrc2->au32[ 5]);
9706 puDst->au32[ 6] = RT_MAX(puSrc1->au32[ 6], puSrc2->au32[ 6]);
9707 puDst->au32[ 7] = RT_MAX(puSrc1->au32[ 7], puSrc2->au32[ 7]);
9708 RT_NOREF(pExtState);
9709}
9710
9711
9712/*
9713 * PMAXSB / VPMAXSB / PMAXSW / VPMAXSW / PMAXSD / VPMAXSD
9714 */
9715#ifdef IEM_WITHOUT_ASSEMBLY
9716
9717IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxsw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9718{
9719 RTUINT64U uSrc1 = { *puDst };
9720 RTUINT64U uSrc2 = { *puSrc };
9721 RTUINT64U uDst;
9722
9723 uDst.ai16[0] = RT_MAX(uSrc1.ai16[0], uSrc2.ai16[0]);
9724 uDst.ai16[1] = RT_MAX(uSrc1.ai16[1], uSrc2.ai16[1]);
9725 uDst.ai16[2] = RT_MAX(uSrc1.ai16[2], uSrc2.ai16[2]);
9726 uDst.ai16[3] = RT_MAX(uSrc1.ai16[3], uSrc2.ai16[3]);
9727 *puDst = uDst.u;
9728 RT_NOREF(pFpuState);
9729}
9730
9731
9732IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxsw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9733{
9734 RTUINT128U uSrc1 = *puDst;
9735
9736 puDst->ai16[ 0] = RT_MAX(uSrc1.ai16[ 0], puSrc->ai16[ 0]);
9737 puDst->ai16[ 1] = RT_MAX(uSrc1.ai16[ 1], puSrc->ai16[ 1]);
9738 puDst->ai16[ 2] = RT_MAX(uSrc1.ai16[ 2], puSrc->ai16[ 2]);
9739 puDst->ai16[ 3] = RT_MAX(uSrc1.ai16[ 3], puSrc->ai16[ 3]);
9740 puDst->ai16[ 4] = RT_MAX(uSrc1.ai16[ 4], puSrc->ai16[ 4]);
9741 puDst->ai16[ 5] = RT_MAX(uSrc1.ai16[ 5], puSrc->ai16[ 5]);
9742 puDst->ai16[ 6] = RT_MAX(uSrc1.ai16[ 6], puSrc->ai16[ 6]);
9743 puDst->ai16[ 7] = RT_MAX(uSrc1.ai16[ 7], puSrc->ai16[ 7]);
9744 RT_NOREF(pFpuState);
9745}
9746
9747#endif
9748
9749IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxsb_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9750{
9751 RTUINT128U uSrc1 = *puDst;
9752
9753 puDst->ai8[ 0] = RT_MAX(uSrc1.ai8[ 0], puSrc->ai8[ 0]);
9754 puDst->ai8[ 1] = RT_MAX(uSrc1.ai8[ 1], puSrc->ai8[ 1]);
9755 puDst->ai8[ 2] = RT_MAX(uSrc1.ai8[ 2], puSrc->ai8[ 2]);
9756 puDst->ai8[ 3] = RT_MAX(uSrc1.ai8[ 3], puSrc->ai8[ 3]);
9757 puDst->ai8[ 4] = RT_MAX(uSrc1.ai8[ 4], puSrc->ai8[ 4]);
9758 puDst->ai8[ 5] = RT_MAX(uSrc1.ai8[ 5], puSrc->ai8[ 5]);
9759 puDst->ai8[ 6] = RT_MAX(uSrc1.ai8[ 6], puSrc->ai8[ 6]);
9760 puDst->ai8[ 7] = RT_MAX(uSrc1.ai8[ 7], puSrc->ai8[ 7]);
9761 puDst->ai8[ 8] = RT_MAX(uSrc1.ai8[ 8], puSrc->ai8[ 8]);
9762 puDst->ai8[ 9] = RT_MAX(uSrc1.ai8[ 9], puSrc->ai8[ 9]);
9763 puDst->ai8[10] = RT_MAX(uSrc1.ai8[10], puSrc->ai8[10]);
9764 puDst->ai8[11] = RT_MAX(uSrc1.ai8[11], puSrc->ai8[11]);
9765 puDst->ai8[12] = RT_MAX(uSrc1.ai8[12], puSrc->ai8[12]);
9766 puDst->ai8[13] = RT_MAX(uSrc1.ai8[13], puSrc->ai8[13]);
9767 puDst->ai8[14] = RT_MAX(uSrc1.ai8[14], puSrc->ai8[14]);
9768 puDst->ai8[15] = RT_MAX(uSrc1.ai8[15], puSrc->ai8[15]);
9769 RT_NOREF(pFpuState);
9770}
9771
9772
9773IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxsd_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9774{
9775 RTUINT128U uSrc1 = *puDst;
9776
9777 puDst->ai32[ 0] = RT_MAX(uSrc1.ai32[ 0], puSrc->ai32[ 0]);
9778 puDst->ai32[ 1] = RT_MAX(uSrc1.ai32[ 1], puSrc->ai32[ 1]);
9779 puDst->ai32[ 2] = RT_MAX(uSrc1.ai32[ 2], puSrc->ai32[ 2]);
9780 puDst->ai32[ 3] = RT_MAX(uSrc1.ai32[ 3], puSrc->ai32[ 3]);
9781 RT_NOREF(pFpuState);
9782}
9783
9784
9785IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxsb_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
9786 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9787{
9788 puDst->ai8[ 0] = RT_MAX(puSrc1->ai8[ 0], puSrc2->ai8[ 0]);
9789 puDst->ai8[ 1] = RT_MAX(puSrc1->ai8[ 1], puSrc2->ai8[ 1]);
9790 puDst->ai8[ 2] = RT_MAX(puSrc1->ai8[ 2], puSrc2->ai8[ 2]);
9791 puDst->ai8[ 3] = RT_MAX(puSrc1->ai8[ 3], puSrc2->ai8[ 3]);
9792 puDst->ai8[ 4] = RT_MAX(puSrc1->ai8[ 4], puSrc2->ai8[ 4]);
9793 puDst->ai8[ 5] = RT_MAX(puSrc1->ai8[ 5], puSrc2->ai8[ 5]);
9794 puDst->ai8[ 6] = RT_MAX(puSrc1->ai8[ 6], puSrc2->ai8[ 6]);
9795 puDst->ai8[ 7] = RT_MAX(puSrc1->ai8[ 7], puSrc2->ai8[ 7]);
9796 puDst->ai8[ 8] = RT_MAX(puSrc1->ai8[ 8], puSrc2->ai8[ 8]);
9797 puDst->ai8[ 9] = RT_MAX(puSrc1->ai8[ 9], puSrc2->ai8[ 9]);
9798 puDst->ai8[10] = RT_MAX(puSrc1->ai8[10], puSrc2->ai8[10]);
9799 puDst->ai8[11] = RT_MAX(puSrc1->ai8[11], puSrc2->ai8[11]);
9800 puDst->ai8[12] = RT_MAX(puSrc1->ai8[12], puSrc2->ai8[12]);
9801 puDst->ai8[13] = RT_MAX(puSrc1->ai8[13], puSrc2->ai8[13]);
9802 puDst->ai8[14] = RT_MAX(puSrc1->ai8[14], puSrc2->ai8[14]);
9803 puDst->ai8[15] = RT_MAX(puSrc1->ai8[15], puSrc2->ai8[15]);
9804 RT_NOREF(pExtState);
9805}
9806
9807
9808IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxsb_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
9809 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9810{
9811 puDst->ai8[ 0] = RT_MAX(puSrc1->ai8[ 0], puSrc2->ai8[ 0]);
9812 puDst->ai8[ 1] = RT_MAX(puSrc1->ai8[ 1], puSrc2->ai8[ 1]);
9813 puDst->ai8[ 2] = RT_MAX(puSrc1->ai8[ 2], puSrc2->ai8[ 2]);
9814 puDst->ai8[ 3] = RT_MAX(puSrc1->ai8[ 3], puSrc2->ai8[ 3]);
9815 puDst->ai8[ 4] = RT_MAX(puSrc1->ai8[ 4], puSrc2->ai8[ 4]);
9816 puDst->ai8[ 5] = RT_MAX(puSrc1->ai8[ 5], puSrc2->ai8[ 5]);
9817 puDst->ai8[ 6] = RT_MAX(puSrc1->ai8[ 6], puSrc2->ai8[ 6]);
9818 puDst->ai8[ 7] = RT_MAX(puSrc1->ai8[ 7], puSrc2->ai8[ 7]);
9819 puDst->ai8[ 8] = RT_MAX(puSrc1->ai8[ 8], puSrc2->ai8[ 8]);
9820 puDst->ai8[ 9] = RT_MAX(puSrc1->ai8[ 9], puSrc2->ai8[ 9]);
9821 puDst->ai8[10] = RT_MAX(puSrc1->ai8[10], puSrc2->ai8[10]);
9822 puDst->ai8[11] = RT_MAX(puSrc1->ai8[11], puSrc2->ai8[11]);
9823 puDst->ai8[12] = RT_MAX(puSrc1->ai8[12], puSrc2->ai8[12]);
9824 puDst->ai8[13] = RT_MAX(puSrc1->ai8[13], puSrc2->ai8[13]);
9825 puDst->ai8[14] = RT_MAX(puSrc1->ai8[14], puSrc2->ai8[14]);
9826 puDst->ai8[15] = RT_MAX(puSrc1->ai8[15], puSrc2->ai8[15]);
9827 puDst->ai8[16] = RT_MAX(puSrc1->ai8[16], puSrc2->ai8[16]);
9828 puDst->ai8[17] = RT_MAX(puSrc1->ai8[17], puSrc2->ai8[17]);
9829 puDst->ai8[18] = RT_MAX(puSrc1->ai8[18], puSrc2->ai8[18]);
9830 puDst->ai8[19] = RT_MAX(puSrc1->ai8[19], puSrc2->ai8[19]);
9831 puDst->ai8[20] = RT_MAX(puSrc1->ai8[20], puSrc2->ai8[20]);
9832 puDst->ai8[21] = RT_MAX(puSrc1->ai8[21], puSrc2->ai8[21]);
9833 puDst->ai8[22] = RT_MAX(puSrc1->ai8[22], puSrc2->ai8[22]);
9834 puDst->ai8[23] = RT_MAX(puSrc1->ai8[23], puSrc2->ai8[23]);
9835 puDst->ai8[24] = RT_MAX(puSrc1->ai8[24], puSrc2->ai8[24]);
9836 puDst->ai8[25] = RT_MAX(puSrc1->ai8[25], puSrc2->ai8[25]);
9837 puDst->ai8[26] = RT_MAX(puSrc1->ai8[26], puSrc2->ai8[26]);
9838 puDst->ai8[27] = RT_MAX(puSrc1->ai8[27], puSrc2->ai8[27]);
9839 puDst->ai8[28] = RT_MAX(puSrc1->ai8[28], puSrc2->ai8[28]);
9840 puDst->ai8[29] = RT_MAX(puSrc1->ai8[29], puSrc2->ai8[29]);
9841 puDst->ai8[30] = RT_MAX(puSrc1->ai8[30], puSrc2->ai8[30]);
9842 puDst->ai8[31] = RT_MAX(puSrc1->ai8[31], puSrc2->ai8[31]);
9843 RT_NOREF(pExtState);
9844}
9845
9846
9847IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxsw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
9848 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9849{
9850 puDst->ai16[ 0] = RT_MAX(puSrc1->ai16[ 0], puSrc2->ai16[ 0]);
9851 puDst->ai16[ 1] = RT_MAX(puSrc1->ai16[ 1], puSrc2->ai16[ 1]);
9852 puDst->ai16[ 2] = RT_MAX(puSrc1->ai16[ 2], puSrc2->ai16[ 2]);
9853 puDst->ai16[ 3] = RT_MAX(puSrc1->ai16[ 3], puSrc2->ai16[ 3]);
9854 puDst->ai16[ 4] = RT_MAX(puSrc1->ai16[ 4], puSrc2->ai16[ 4]);
9855 puDst->ai16[ 5] = RT_MAX(puSrc1->ai16[ 5], puSrc2->ai16[ 5]);
9856 puDst->ai16[ 6] = RT_MAX(puSrc1->ai16[ 6], puSrc2->ai16[ 6]);
9857 puDst->ai16[ 7] = RT_MAX(puSrc1->ai16[ 7], puSrc2->ai16[ 7]);
9858 RT_NOREF(pExtState);
9859}
9860
9861
9862IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxsw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
9863 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9864{
9865 puDst->ai16[ 0] = RT_MAX(puSrc1->ai16[ 0], puSrc2->ai16[ 0]);
9866 puDst->ai16[ 1] = RT_MAX(puSrc1->ai16[ 1], puSrc2->ai16[ 1]);
9867 puDst->ai16[ 2] = RT_MAX(puSrc1->ai16[ 2], puSrc2->ai16[ 2]);
9868 puDst->ai16[ 3] = RT_MAX(puSrc1->ai16[ 3], puSrc2->ai16[ 3]);
9869 puDst->ai16[ 4] = RT_MAX(puSrc1->ai16[ 4], puSrc2->ai16[ 4]);
9870 puDst->ai16[ 5] = RT_MAX(puSrc1->ai16[ 5], puSrc2->ai16[ 5]);
9871 puDst->ai16[ 6] = RT_MAX(puSrc1->ai16[ 6], puSrc2->ai16[ 6]);
9872 puDst->ai16[ 7] = RT_MAX(puSrc1->ai16[ 7], puSrc2->ai16[ 7]);
9873 puDst->ai16[ 8] = RT_MAX(puSrc1->ai16[ 8], puSrc2->ai16[ 8]);
9874 puDst->ai16[ 9] = RT_MAX(puSrc1->ai16[ 9], puSrc2->ai16[ 9]);
9875 puDst->ai16[10] = RT_MAX(puSrc1->ai16[10], puSrc2->ai16[10]);
9876 puDst->ai16[11] = RT_MAX(puSrc1->ai16[11], puSrc2->ai16[11]);
9877 puDst->ai16[12] = RT_MAX(puSrc1->ai16[12], puSrc2->ai16[12]);
9878 puDst->ai16[13] = RT_MAX(puSrc1->ai16[13], puSrc2->ai16[13]);
9879 puDst->ai16[14] = RT_MAX(puSrc1->ai16[14], puSrc2->ai16[14]);
9880 puDst->ai16[15] = RT_MAX(puSrc1->ai16[15], puSrc2->ai16[15]);
9881 RT_NOREF(pExtState);
9882}
9883
9884
9885IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxsd_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
9886 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9887{
9888 puDst->ai32[ 0] = RT_MAX(puSrc1->ai32[ 0], puSrc2->ai32[ 0]);
9889 puDst->ai32[ 1] = RT_MAX(puSrc1->ai32[ 1], puSrc2->ai32[ 1]);
9890 puDst->ai32[ 2] = RT_MAX(puSrc1->ai32[ 2], puSrc2->ai32[ 2]);
9891 puDst->ai32[ 3] = RT_MAX(puSrc1->ai32[ 3], puSrc2->ai32[ 3]);
9892 RT_NOREF(pExtState);
9893}
9894
9895
9896IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxsd_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
9897 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9898{
9899 puDst->ai32[ 0] = RT_MAX(puSrc1->ai32[ 0], puSrc2->ai32[ 0]);
9900 puDst->ai32[ 1] = RT_MAX(puSrc1->ai32[ 1], puSrc2->ai32[ 1]);
9901 puDst->ai32[ 2] = RT_MAX(puSrc1->ai32[ 2], puSrc2->ai32[ 2]);
9902 puDst->ai32[ 3] = RT_MAX(puSrc1->ai32[ 3], puSrc2->ai32[ 3]);
9903 puDst->ai32[ 4] = RT_MAX(puSrc1->ai32[ 4], puSrc2->ai32[ 4]);
9904 puDst->ai32[ 5] = RT_MAX(puSrc1->ai32[ 5], puSrc2->ai32[ 5]);
9905 puDst->ai32[ 6] = RT_MAX(puSrc1->ai32[ 6], puSrc2->ai32[ 6]);
9906 puDst->ai32[ 7] = RT_MAX(puSrc1->ai32[ 7], puSrc2->ai32[ 7]);
9907 RT_NOREF(pExtState);
9908}
9909
9910
9911/*
9912 * PMINUB / VPMINUB / PMINUW / VPMINUW / PMINUD / VPMINUD
9913 */
9914#ifdef IEM_WITHOUT_ASSEMBLY
9915
9916IEM_DECL_IMPL_DEF(void, iemAImpl_pminub_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9917{
9918 RTUINT64U uSrc1 = { *puDst };
9919 RTUINT64U uSrc2 = { *puSrc };
9920 RTUINT64U uDst;
9921
9922 uDst.au8[0] = RT_MIN(uSrc1.au8[0], uSrc2.au8[0]);
9923 uDst.au8[1] = RT_MIN(uSrc1.au8[1], uSrc2.au8[1]);
9924 uDst.au8[2] = RT_MIN(uSrc1.au8[2], uSrc2.au8[2]);
9925 uDst.au8[3] = RT_MIN(uSrc1.au8[3], uSrc2.au8[3]);
9926 uDst.au8[4] = RT_MIN(uSrc1.au8[4], uSrc2.au8[4]);
9927 uDst.au8[5] = RT_MIN(uSrc1.au8[5], uSrc2.au8[5]);
9928 uDst.au8[6] = RT_MIN(uSrc1.au8[6], uSrc2.au8[6]);
9929 uDst.au8[7] = RT_MIN(uSrc1.au8[7], uSrc2.au8[7]);
9930 *puDst = uDst.u;
9931 RT_NOREF(pFpuState);
9932}
9933
9934
9935IEM_DECL_IMPL_DEF(void, iemAImpl_pminub_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9936{
9937 RTUINT128U uSrc1 = *puDst;
9938
9939 puDst->au8[ 0] = RT_MIN(uSrc1.au8[ 0], puSrc->au8[ 0]);
9940 puDst->au8[ 1] = RT_MIN(uSrc1.au8[ 1], puSrc->au8[ 1]);
9941 puDst->au8[ 2] = RT_MIN(uSrc1.au8[ 2], puSrc->au8[ 2]);
9942 puDst->au8[ 3] = RT_MIN(uSrc1.au8[ 3], puSrc->au8[ 3]);
9943 puDst->au8[ 4] = RT_MIN(uSrc1.au8[ 4], puSrc->au8[ 4]);
9944 puDst->au8[ 5] = RT_MIN(uSrc1.au8[ 5], puSrc->au8[ 5]);
9945 puDst->au8[ 6] = RT_MIN(uSrc1.au8[ 6], puSrc->au8[ 6]);
9946 puDst->au8[ 7] = RT_MIN(uSrc1.au8[ 7], puSrc->au8[ 7]);
9947 puDst->au8[ 8] = RT_MIN(uSrc1.au8[ 8], puSrc->au8[ 8]);
9948 puDst->au8[ 9] = RT_MIN(uSrc1.au8[ 9], puSrc->au8[ 9]);
9949 puDst->au8[10] = RT_MIN(uSrc1.au8[10], puSrc->au8[10]);
9950 puDst->au8[11] = RT_MIN(uSrc1.au8[11], puSrc->au8[11]);
9951 puDst->au8[12] = RT_MIN(uSrc1.au8[12], puSrc->au8[12]);
9952 puDst->au8[13] = RT_MIN(uSrc1.au8[13], puSrc->au8[13]);
9953 puDst->au8[14] = RT_MIN(uSrc1.au8[14], puSrc->au8[14]);
9954 puDst->au8[15] = RT_MIN(uSrc1.au8[15], puSrc->au8[15]);
9955 RT_NOREF(pFpuState);
9956}
9957
9958#endif
9959
9960IEM_DECL_IMPL_DEF(void, iemAImpl_pminuw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9961{
9962 RTUINT128U uSrc1 = *puDst;
9963
9964 puDst->au16[ 0] = RT_MIN(uSrc1.au16[ 0], puSrc->au16[ 0]);
9965 puDst->au16[ 1] = RT_MIN(uSrc1.au16[ 1], puSrc->au16[ 1]);
9966 puDst->au16[ 2] = RT_MIN(uSrc1.au16[ 2], puSrc->au16[ 2]);
9967 puDst->au16[ 3] = RT_MIN(uSrc1.au16[ 3], puSrc->au16[ 3]);
9968 puDst->au16[ 4] = RT_MIN(uSrc1.au16[ 4], puSrc->au16[ 4]);
9969 puDst->au16[ 5] = RT_MIN(uSrc1.au16[ 5], puSrc->au16[ 5]);
9970 puDst->au16[ 6] = RT_MIN(uSrc1.au16[ 6], puSrc->au16[ 6]);
9971 puDst->au16[ 7] = RT_MIN(uSrc1.au16[ 7], puSrc->au16[ 7]);
9972 RT_NOREF(pFpuState);
9973}
9974
9975
9976IEM_DECL_IMPL_DEF(void, iemAImpl_pminud_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9977{
9978 RTUINT128U uSrc1 = *puDst;
9979
9980 puDst->au32[ 0] = RT_MIN(uSrc1.au32[ 0], puSrc->au32[ 0]);
9981 puDst->au32[ 1] = RT_MIN(uSrc1.au32[ 1], puSrc->au32[ 1]);
9982 puDst->au32[ 2] = RT_MIN(uSrc1.au32[ 2], puSrc->au32[ 2]);
9983 puDst->au32[ 3] = RT_MIN(uSrc1.au32[ 3], puSrc->au32[ 3]);
9984 RT_NOREF(pFpuState);
9985}
9986
9987
9988IEM_DECL_IMPL_DEF(void, iemAImpl_vpminub_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
9989 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9990{
9991 puDst->au8[ 0] = RT_MIN(puSrc1->au8[ 0], puSrc2->au8[ 0]);
9992 puDst->au8[ 1] = RT_MIN(puSrc1->au8[ 1], puSrc2->au8[ 1]);
9993 puDst->au8[ 2] = RT_MIN(puSrc1->au8[ 2], puSrc2->au8[ 2]);
9994 puDst->au8[ 3] = RT_MIN(puSrc1->au8[ 3], puSrc2->au8[ 3]);
9995 puDst->au8[ 4] = RT_MIN(puSrc1->au8[ 4], puSrc2->au8[ 4]);
9996 puDst->au8[ 5] = RT_MIN(puSrc1->au8[ 5], puSrc2->au8[ 5]);
9997 puDst->au8[ 6] = RT_MIN(puSrc1->au8[ 6], puSrc2->au8[ 6]);
9998 puDst->au8[ 7] = RT_MIN(puSrc1->au8[ 7], puSrc2->au8[ 7]);
9999 puDst->au8[ 8] = RT_MIN(puSrc1->au8[ 8], puSrc2->au8[ 8]);
10000 puDst->au8[ 9] = RT_MIN(puSrc1->au8[ 9], puSrc2->au8[ 9]);
10001 puDst->au8[10] = RT_MIN(puSrc1->au8[10], puSrc2->au8[10]);
10002 puDst->au8[11] = RT_MIN(puSrc1->au8[11], puSrc2->au8[11]);
10003 puDst->au8[12] = RT_MIN(puSrc1->au8[12], puSrc2->au8[12]);
10004 puDst->au8[13] = RT_MIN(puSrc1->au8[13], puSrc2->au8[13]);
10005 puDst->au8[14] = RT_MIN(puSrc1->au8[14], puSrc2->au8[14]);
10006 puDst->au8[15] = RT_MIN(puSrc1->au8[15], puSrc2->au8[15]);
10007 RT_NOREF(pExtState);
10008}
10009
10010
10011IEM_DECL_IMPL_DEF(void, iemAImpl_vpminub_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
10012 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10013{
10014 puDst->au8[ 0] = RT_MIN(puSrc1->au8[ 0], puSrc2->au8[ 0]);
10015 puDst->au8[ 1] = RT_MIN(puSrc1->au8[ 1], puSrc2->au8[ 1]);
10016 puDst->au8[ 2] = RT_MIN(puSrc1->au8[ 2], puSrc2->au8[ 2]);
10017 puDst->au8[ 3] = RT_MIN(puSrc1->au8[ 3], puSrc2->au8[ 3]);
10018 puDst->au8[ 4] = RT_MIN(puSrc1->au8[ 4], puSrc2->au8[ 4]);
10019 puDst->au8[ 5] = RT_MIN(puSrc1->au8[ 5], puSrc2->au8[ 5]);
10020 puDst->au8[ 6] = RT_MIN(puSrc1->au8[ 6], puSrc2->au8[ 6]);
10021 puDst->au8[ 7] = RT_MIN(puSrc1->au8[ 7], puSrc2->au8[ 7]);
10022 puDst->au8[ 8] = RT_MIN(puSrc1->au8[ 8], puSrc2->au8[ 8]);
10023 puDst->au8[ 9] = RT_MIN(puSrc1->au8[ 9], puSrc2->au8[ 9]);
10024 puDst->au8[10] = RT_MIN(puSrc1->au8[10], puSrc2->au8[10]);
10025 puDst->au8[11] = RT_MIN(puSrc1->au8[11], puSrc2->au8[11]);
10026 puDst->au8[12] = RT_MIN(puSrc1->au8[12], puSrc2->au8[12]);
10027 puDst->au8[13] = RT_MIN(puSrc1->au8[13], puSrc2->au8[13]);
10028 puDst->au8[14] = RT_MIN(puSrc1->au8[14], puSrc2->au8[14]);
10029 puDst->au8[15] = RT_MIN(puSrc1->au8[15], puSrc2->au8[15]);
10030 puDst->au8[16] = RT_MIN(puSrc1->au8[16], puSrc2->au8[16]);
10031 puDst->au8[17] = RT_MIN(puSrc1->au8[17], puSrc2->au8[17]);
10032 puDst->au8[18] = RT_MIN(puSrc1->au8[18], puSrc2->au8[18]);
10033 puDst->au8[19] = RT_MIN(puSrc1->au8[19], puSrc2->au8[19]);
10034 puDst->au8[20] = RT_MIN(puSrc1->au8[20], puSrc2->au8[20]);
10035 puDst->au8[21] = RT_MIN(puSrc1->au8[21], puSrc2->au8[21]);
10036 puDst->au8[22] = RT_MIN(puSrc1->au8[22], puSrc2->au8[22]);
10037 puDst->au8[23] = RT_MIN(puSrc1->au8[23], puSrc2->au8[23]);
10038 puDst->au8[24] = RT_MIN(puSrc1->au8[24], puSrc2->au8[24]);
10039 puDst->au8[25] = RT_MIN(puSrc1->au8[25], puSrc2->au8[25]);
10040 puDst->au8[26] = RT_MIN(puSrc1->au8[26], puSrc2->au8[26]);
10041 puDst->au8[27] = RT_MIN(puSrc1->au8[27], puSrc2->au8[27]);
10042 puDst->au8[28] = RT_MIN(puSrc1->au8[28], puSrc2->au8[28]);
10043 puDst->au8[29] = RT_MIN(puSrc1->au8[29], puSrc2->au8[29]);
10044 puDst->au8[30] = RT_MIN(puSrc1->au8[30], puSrc2->au8[30]);
10045 puDst->au8[31] = RT_MIN(puSrc1->au8[31], puSrc2->au8[31]);
10046 RT_NOREF(pExtState);
10047}
10048
10049
10050IEM_DECL_IMPL_DEF(void, iemAImpl_vpminuw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
10051 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10052{
10053 puDst->au16[ 0] = RT_MIN(puSrc1->au16[ 0], puSrc2->au16[ 0]);
10054 puDst->au16[ 1] = RT_MIN(puSrc1->au16[ 1], puSrc2->au16[ 1]);
10055 puDst->au16[ 2] = RT_MIN(puSrc1->au16[ 2], puSrc2->au16[ 2]);
10056 puDst->au16[ 3] = RT_MIN(puSrc1->au16[ 3], puSrc2->au16[ 3]);
10057 puDst->au16[ 4] = RT_MIN(puSrc1->au16[ 4], puSrc2->au16[ 4]);
10058 puDst->au16[ 5] = RT_MIN(puSrc1->au16[ 5], puSrc2->au16[ 5]);
10059 puDst->au16[ 6] = RT_MIN(puSrc1->au16[ 6], puSrc2->au16[ 6]);
10060 puDst->au16[ 7] = RT_MIN(puSrc1->au16[ 7], puSrc2->au16[ 7]);
10061 RT_NOREF(pExtState);
10062}
10063
10064
10065IEM_DECL_IMPL_DEF(void, iemAImpl_vpminuw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
10066 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10067{
10068 puDst->au16[ 0] = RT_MIN(puSrc1->au16[ 0], puSrc2->au16[ 0]);
10069 puDst->au16[ 1] = RT_MIN(puSrc1->au16[ 1], puSrc2->au16[ 1]);
10070 puDst->au16[ 2] = RT_MIN(puSrc1->au16[ 2], puSrc2->au16[ 2]);
10071 puDst->au16[ 3] = RT_MIN(puSrc1->au16[ 3], puSrc2->au16[ 3]);
10072 puDst->au16[ 4] = RT_MIN(puSrc1->au16[ 4], puSrc2->au16[ 4]);
10073 puDst->au16[ 5] = RT_MIN(puSrc1->au16[ 5], puSrc2->au16[ 5]);
10074 puDst->au16[ 6] = RT_MIN(puSrc1->au16[ 6], puSrc2->au16[ 6]);
10075 puDst->au16[ 7] = RT_MIN(puSrc1->au16[ 7], puSrc2->au16[ 7]);
10076 puDst->au16[ 8] = RT_MIN(puSrc1->au16[ 8], puSrc2->au16[ 8]);
10077 puDst->au16[ 9] = RT_MIN(puSrc1->au16[ 9], puSrc2->au16[ 9]);
10078 puDst->au16[10] = RT_MIN(puSrc1->au16[10], puSrc2->au16[10]);
10079 puDst->au16[11] = RT_MIN(puSrc1->au16[11], puSrc2->au16[11]);
10080 puDst->au16[12] = RT_MIN(puSrc1->au16[12], puSrc2->au16[12]);
10081 puDst->au16[13] = RT_MIN(puSrc1->au16[13], puSrc2->au16[13]);
10082 puDst->au16[14] = RT_MIN(puSrc1->au16[14], puSrc2->au16[14]);
10083 puDst->au16[15] = RT_MIN(puSrc1->au16[15], puSrc2->au16[15]);
10084 RT_NOREF(pExtState);
10085}
10086
10087
10088IEM_DECL_IMPL_DEF(void, iemAImpl_vpminud_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
10089 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10090{
10091 puDst->au32[ 0] = RT_MIN(puSrc1->au32[ 0], puSrc2->au32[ 0]);
10092 puDst->au32[ 1] = RT_MIN(puSrc1->au32[ 1], puSrc2->au32[ 1]);
10093 puDst->au32[ 2] = RT_MIN(puSrc1->au32[ 2], puSrc2->au32[ 2]);
10094 puDst->au32[ 3] = RT_MIN(puSrc1->au32[ 3], puSrc2->au32[ 3]);
10095 RT_NOREF(pExtState);
10096}
10097
10098
10099IEM_DECL_IMPL_DEF(void, iemAImpl_vpminud_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
10100 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10101{
10102 puDst->au32[ 0] = RT_MIN(puSrc1->au32[ 0], puSrc2->au32[ 0]);
10103 puDst->au32[ 1] = RT_MIN(puSrc1->au32[ 1], puSrc2->au32[ 1]);
10104 puDst->au32[ 2] = RT_MIN(puSrc1->au32[ 2], puSrc2->au32[ 2]);
10105 puDst->au32[ 3] = RT_MIN(puSrc1->au32[ 3], puSrc2->au32[ 3]);
10106 puDst->au32[ 4] = RT_MIN(puSrc1->au32[ 4], puSrc2->au32[ 4]);
10107 puDst->au32[ 5] = RT_MIN(puSrc1->au32[ 5], puSrc2->au32[ 5]);
10108 puDst->au32[ 6] = RT_MIN(puSrc1->au32[ 6], puSrc2->au32[ 6]);
10109 puDst->au32[ 7] = RT_MIN(puSrc1->au32[ 7], puSrc2->au32[ 7]);
10110 RT_NOREF(pExtState);
10111}
10112
10113
10114/*
10115 * PMINSB / VPMINSB / PMINSW / VPMINSW / PMINSD / VPMINSD
10116 */
10117#ifdef IEM_WITHOUT_ASSEMBLY
10118
10119IEM_DECL_IMPL_DEF(void, iemAImpl_pminsw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
10120{
10121 RTUINT64U uSrc1 = { *puDst };
10122 RTUINT64U uSrc2 = { *puSrc };
10123 RTUINT64U uDst;
10124
10125 uDst.ai16[0] = RT_MIN(uSrc1.ai16[0], uSrc2.ai16[0]);
10126 uDst.ai16[1] = RT_MIN(uSrc1.ai16[1], uSrc2.ai16[1]);
10127 uDst.ai16[2] = RT_MIN(uSrc1.ai16[2], uSrc2.ai16[2]);
10128 uDst.ai16[3] = RT_MIN(uSrc1.ai16[3], uSrc2.ai16[3]);
10129 *puDst = uDst.u;
10130 RT_NOREF(pFpuState);
10131}
10132
10133
10134IEM_DECL_IMPL_DEF(void, iemAImpl_pminsw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
10135{
10136 RTUINT128U uSrc1 = *puDst;
10137
10138 puDst->ai16[ 0] = RT_MIN(uSrc1.ai16[ 0], puSrc->ai16[ 0]);
10139 puDst->ai16[ 1] = RT_MIN(uSrc1.ai16[ 1], puSrc->ai16[ 1]);
10140 puDst->ai16[ 2] = RT_MIN(uSrc1.ai16[ 2], puSrc->ai16[ 2]);
10141 puDst->ai16[ 3] = RT_MIN(uSrc1.ai16[ 3], puSrc->ai16[ 3]);
10142 puDst->ai16[ 4] = RT_MIN(uSrc1.ai16[ 4], puSrc->ai16[ 4]);
10143 puDst->ai16[ 5] = RT_MIN(uSrc1.ai16[ 5], puSrc->ai16[ 5]);
10144 puDst->ai16[ 6] = RT_MIN(uSrc1.ai16[ 6], puSrc->ai16[ 6]);
10145 puDst->ai16[ 7] = RT_MIN(uSrc1.ai16[ 7], puSrc->ai16[ 7]);
10146 RT_NOREF(pFpuState);
10147}
10148
10149#endif
10150
10151IEM_DECL_IMPL_DEF(void, iemAImpl_pminsb_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
10152{
10153 RTUINT128U uSrc1 = *puDst;
10154
10155 puDst->ai8[ 0] = RT_MIN(uSrc1.ai8[ 0], puSrc->ai8[ 0]);
10156 puDst->ai8[ 1] = RT_MIN(uSrc1.ai8[ 1], puSrc->ai8[ 1]);
10157 puDst->ai8[ 2] = RT_MIN(uSrc1.ai8[ 2], puSrc->ai8[ 2]);
10158 puDst->ai8[ 3] = RT_MIN(uSrc1.ai8[ 3], puSrc->ai8[ 3]);
10159 puDst->ai8[ 4] = RT_MIN(uSrc1.ai8[ 4], puSrc->ai8[ 4]);
10160 puDst->ai8[ 5] = RT_MIN(uSrc1.ai8[ 5], puSrc->ai8[ 5]);
10161 puDst->ai8[ 6] = RT_MIN(uSrc1.ai8[ 6], puSrc->ai8[ 6]);
10162 puDst->ai8[ 7] = RT_MIN(uSrc1.ai8[ 7], puSrc->ai8[ 7]);
10163 puDst->ai8[ 8] = RT_MIN(uSrc1.ai8[ 8], puSrc->ai8[ 8]);
10164 puDst->ai8[ 9] = RT_MIN(uSrc1.ai8[ 9], puSrc->ai8[ 9]);
10165 puDst->ai8[10] = RT_MIN(uSrc1.ai8[10], puSrc->ai8[10]);
10166 puDst->ai8[11] = RT_MIN(uSrc1.ai8[11], puSrc->ai8[11]);
10167 puDst->ai8[12] = RT_MIN(uSrc1.ai8[12], puSrc->ai8[12]);
10168 puDst->ai8[13] = RT_MIN(uSrc1.ai8[13], puSrc->ai8[13]);
10169 puDst->ai8[14] = RT_MIN(uSrc1.ai8[14], puSrc->ai8[14]);
10170 puDst->ai8[15] = RT_MIN(uSrc1.ai8[15], puSrc->ai8[15]);
10171 RT_NOREF(pFpuState);
10172}
10173
10174
10175IEM_DECL_IMPL_DEF(void, iemAImpl_pminsd_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
10176{
10177 RTUINT128U uSrc1 = *puDst;
10178
10179 puDst->ai32[ 0] = RT_MIN(uSrc1.ai32[ 0], puSrc->ai32[ 0]);
10180 puDst->ai32[ 1] = RT_MIN(uSrc1.ai32[ 1], puSrc->ai32[ 1]);
10181 puDst->ai32[ 2] = RT_MIN(uSrc1.ai32[ 2], puSrc->ai32[ 2]);
10182 puDst->ai32[ 3] = RT_MIN(uSrc1.ai32[ 3], puSrc->ai32[ 3]);
10183 RT_NOREF(pFpuState);
10184}
10185
10186
10187IEM_DECL_IMPL_DEF(void, iemAImpl_vpminsb_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
10188 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10189{
10190 puDst->ai8[ 0] = RT_MIN(puSrc1->ai8[ 0], puSrc2->ai8[ 0]);
10191 puDst->ai8[ 1] = RT_MIN(puSrc1->ai8[ 1], puSrc2->ai8[ 1]);
10192 puDst->ai8[ 2] = RT_MIN(puSrc1->ai8[ 2], puSrc2->ai8[ 2]);
10193 puDst->ai8[ 3] = RT_MIN(puSrc1->ai8[ 3], puSrc2->ai8[ 3]);
10194 puDst->ai8[ 4] = RT_MIN(puSrc1->ai8[ 4], puSrc2->ai8[ 4]);
10195 puDst->ai8[ 5] = RT_MIN(puSrc1->ai8[ 5], puSrc2->ai8[ 5]);
10196 puDst->ai8[ 6] = RT_MIN(puSrc1->ai8[ 6], puSrc2->ai8[ 6]);
10197 puDst->ai8[ 7] = RT_MIN(puSrc1->ai8[ 7], puSrc2->ai8[ 7]);
10198 puDst->ai8[ 8] = RT_MIN(puSrc1->ai8[ 8], puSrc2->ai8[ 8]);
10199 puDst->ai8[ 9] = RT_MIN(puSrc1->ai8[ 9], puSrc2->ai8[ 9]);
10200 puDst->ai8[10] = RT_MIN(puSrc1->ai8[10], puSrc2->ai8[10]);
10201 puDst->ai8[11] = RT_MIN(puSrc1->ai8[11], puSrc2->ai8[11]);
10202 puDst->ai8[12] = RT_MIN(puSrc1->ai8[12], puSrc2->ai8[12]);
10203 puDst->ai8[13] = RT_MIN(puSrc1->ai8[13], puSrc2->ai8[13]);
10204 puDst->ai8[14] = RT_MIN(puSrc1->ai8[14], puSrc2->ai8[14]);
10205 puDst->ai8[15] = RT_MIN(puSrc1->ai8[15], puSrc2->ai8[15]);
10206 RT_NOREF(pExtState);
10207}
10208
10209
10210IEM_DECL_IMPL_DEF(void, iemAImpl_vpminsb_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
10211 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10212{
10213 puDst->ai8[ 0] = RT_MIN(puSrc1->ai8[ 0], puSrc2->ai8[ 0]);
10214 puDst->ai8[ 1] = RT_MIN(puSrc1->ai8[ 1], puSrc2->ai8[ 1]);
10215 puDst->ai8[ 2] = RT_MIN(puSrc1->ai8[ 2], puSrc2->ai8[ 2]);
10216 puDst->ai8[ 3] = RT_MIN(puSrc1->ai8[ 3], puSrc2->ai8[ 3]);
10217 puDst->ai8[ 4] = RT_MIN(puSrc1->ai8[ 4], puSrc2->ai8[ 4]);
10218 puDst->ai8[ 5] = RT_MIN(puSrc1->ai8[ 5], puSrc2->ai8[ 5]);
10219 puDst->ai8[ 6] = RT_MIN(puSrc1->ai8[ 6], puSrc2->ai8[ 6]);
10220 puDst->ai8[ 7] = RT_MIN(puSrc1->ai8[ 7], puSrc2->ai8[ 7]);
10221 puDst->ai8[ 8] = RT_MIN(puSrc1->ai8[ 8], puSrc2->ai8[ 8]);
10222 puDst->ai8[ 9] = RT_MIN(puSrc1->ai8[ 9], puSrc2->ai8[ 9]);
10223 puDst->ai8[10] = RT_MIN(puSrc1->ai8[10], puSrc2->ai8[10]);
10224 puDst->ai8[11] = RT_MIN(puSrc1->ai8[11], puSrc2->ai8[11]);
10225 puDst->ai8[12] = RT_MIN(puSrc1->ai8[12], puSrc2->ai8[12]);
10226 puDst->ai8[13] = RT_MIN(puSrc1->ai8[13], puSrc2->ai8[13]);
10227 puDst->ai8[14] = RT_MIN(puSrc1->ai8[14], puSrc2->ai8[14]);
10228 puDst->ai8[15] = RT_MIN(puSrc1->ai8[15], puSrc2->ai8[15]);
10229 puDst->ai8[16] = RT_MIN(puSrc1->ai8[16], puSrc2->ai8[16]);
10230 puDst->ai8[17] = RT_MIN(puSrc1->ai8[17], puSrc2->ai8[17]);
10231 puDst->ai8[18] = RT_MIN(puSrc1->ai8[18], puSrc2->ai8[18]);
10232 puDst->ai8[19] = RT_MIN(puSrc1->ai8[19], puSrc2->ai8[19]);
10233 puDst->ai8[20] = RT_MIN(puSrc1->ai8[20], puSrc2->ai8[20]);
10234 puDst->ai8[21] = RT_MIN(puSrc1->ai8[21], puSrc2->ai8[21]);
10235 puDst->ai8[22] = RT_MIN(puSrc1->ai8[22], puSrc2->ai8[22]);
10236 puDst->ai8[23] = RT_MIN(puSrc1->ai8[23], puSrc2->ai8[23]);
10237 puDst->ai8[24] = RT_MIN(puSrc1->ai8[24], puSrc2->ai8[24]);
10238 puDst->ai8[25] = RT_MIN(puSrc1->ai8[25], puSrc2->ai8[25]);
10239 puDst->ai8[26] = RT_MIN(puSrc1->ai8[26], puSrc2->ai8[26]);
10240 puDst->ai8[27] = RT_MIN(puSrc1->ai8[27], puSrc2->ai8[27]);
10241 puDst->ai8[28] = RT_MIN(puSrc1->ai8[28], puSrc2->ai8[28]);
10242 puDst->ai8[29] = RT_MIN(puSrc1->ai8[29], puSrc2->ai8[29]);
10243 puDst->ai8[30] = RT_MIN(puSrc1->ai8[30], puSrc2->ai8[30]);
10244 puDst->ai8[31] = RT_MIN(puSrc1->ai8[31], puSrc2->ai8[31]);
10245 RT_NOREF(pExtState);
10246}
10247
10248
10249IEM_DECL_IMPL_DEF(void, iemAImpl_vpminsw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
10250 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10251{
10252 puDst->ai16[ 0] = RT_MIN(puSrc1->ai16[ 0], puSrc2->ai16[ 0]);
10253 puDst->ai16[ 1] = RT_MIN(puSrc1->ai16[ 1], puSrc2->ai16[ 1]);
10254 puDst->ai16[ 2] = RT_MIN(puSrc1->ai16[ 2], puSrc2->ai16[ 2]);
10255 puDst->ai16[ 3] = RT_MIN(puSrc1->ai16[ 3], puSrc2->ai16[ 3]);
10256 puDst->ai16[ 4] = RT_MIN(puSrc1->ai16[ 4], puSrc2->ai16[ 4]);
10257 puDst->ai16[ 5] = RT_MIN(puSrc1->ai16[ 5], puSrc2->ai16[ 5]);
10258 puDst->ai16[ 6] = RT_MIN(puSrc1->ai16[ 6], puSrc2->ai16[ 6]);
10259 puDst->ai16[ 7] = RT_MIN(puSrc1->ai16[ 7], puSrc2->ai16[ 7]);
10260 RT_NOREF(pExtState);
10261}
10262
10263
10264IEM_DECL_IMPL_DEF(void, iemAImpl_vpminsw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
10265 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10266{
10267 puDst->ai16[ 0] = RT_MIN(puSrc1->ai16[ 0], puSrc2->ai16[ 0]);
10268 puDst->ai16[ 1] = RT_MIN(puSrc1->ai16[ 1], puSrc2->ai16[ 1]);
10269 puDst->ai16[ 2] = RT_MIN(puSrc1->ai16[ 2], puSrc2->ai16[ 2]);
10270 puDst->ai16[ 3] = RT_MIN(puSrc1->ai16[ 3], puSrc2->ai16[ 3]);
10271 puDst->ai16[ 4] = RT_MIN(puSrc1->ai16[ 4], puSrc2->ai16[ 4]);
10272 puDst->ai16[ 5] = RT_MIN(puSrc1->ai16[ 5], puSrc2->ai16[ 5]);
10273 puDst->ai16[ 6] = RT_MIN(puSrc1->ai16[ 6], puSrc2->ai16[ 6]);
10274 puDst->ai16[ 7] = RT_MIN(puSrc1->ai16[ 7], puSrc2->ai16[ 7]);
10275 puDst->ai16[ 8] = RT_MIN(puSrc1->ai16[ 8], puSrc2->ai16[ 8]);
10276 puDst->ai16[ 9] = RT_MIN(puSrc1->ai16[ 9], puSrc2->ai16[ 9]);
10277 puDst->ai16[10] = RT_MIN(puSrc1->ai16[10], puSrc2->ai16[10]);
10278 puDst->ai16[11] = RT_MIN(puSrc1->ai16[11], puSrc2->ai16[11]);
10279 puDst->ai16[12] = RT_MIN(puSrc1->ai16[12], puSrc2->ai16[12]);
10280 puDst->ai16[13] = RT_MIN(puSrc1->ai16[13], puSrc2->ai16[13]);
10281 puDst->ai16[14] = RT_MIN(puSrc1->ai16[14], puSrc2->ai16[14]);
10282 puDst->ai16[15] = RT_MIN(puSrc1->ai16[15], puSrc2->ai16[15]);
10283 RT_NOREF(pExtState);
10284}
10285
10286
10287IEM_DECL_IMPL_DEF(void, iemAImpl_vpminsd_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
10288 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10289{
10290 puDst->ai32[ 0] = RT_MIN(puSrc1->ai32[ 0], puSrc2->ai32[ 0]);
10291 puDst->ai32[ 1] = RT_MIN(puSrc1->ai32[ 1], puSrc2->ai32[ 1]);
10292 puDst->ai32[ 2] = RT_MIN(puSrc1->ai32[ 2], puSrc2->ai32[ 2]);
10293 puDst->ai32[ 3] = RT_MIN(puSrc1->ai32[ 3], puSrc2->ai32[ 3]);
10294 RT_NOREF(pExtState);
10295}
10296
10297
10298IEM_DECL_IMPL_DEF(void, iemAImpl_vpminsd_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
10299 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10300{
10301 puDst->ai32[ 0] = RT_MIN(puSrc1->ai32[ 0], puSrc2->ai32[ 0]);
10302 puDst->ai32[ 1] = RT_MIN(puSrc1->ai32[ 1], puSrc2->ai32[ 1]);
10303 puDst->ai32[ 2] = RT_MIN(puSrc1->ai32[ 2], puSrc2->ai32[ 2]);
10304 puDst->ai32[ 3] = RT_MIN(puSrc1->ai32[ 3], puSrc2->ai32[ 3]);
10305 puDst->ai32[ 4] = RT_MIN(puSrc1->ai32[ 4], puSrc2->ai32[ 4]);
10306 puDst->ai32[ 5] = RT_MIN(puSrc1->ai32[ 5], puSrc2->ai32[ 5]);
10307 puDst->ai32[ 6] = RT_MIN(puSrc1->ai32[ 6], puSrc2->ai32[ 6]);
10308 puDst->ai32[ 7] = RT_MIN(puSrc1->ai32[ 7], puSrc2->ai32[ 7]);
10309 RT_NOREF(pExtState);
10310}
10311
10312
10313/*
10314 * PMOVMSKB / VPMOVMSKB
10315 */
10316#ifdef IEM_WITHOUT_ASSEMBLY
10317
10318IEM_DECL_IMPL_DEF(void, iemAImpl_pmovmskb_u64,(uint64_t *pu64Dst, uint64_t const *pu64Src))
10319{
10320 /* Take the most significant bit from each byte and store the resulting bits in the given general purpose register. */
10321 uint64_t const uSrc = *pu64Src;
10322 *pu64Dst = ((uSrc >> ( 7-0)) & RT_BIT_64(0))
10323 | ((uSrc >> (15-1)) & RT_BIT_64(1))
10324 | ((uSrc >> (23-2)) & RT_BIT_64(2))
10325 | ((uSrc >> (31-3)) & RT_BIT_64(3))
10326 | ((uSrc >> (39-4)) & RT_BIT_64(4))
10327 | ((uSrc >> (47-5)) & RT_BIT_64(5))
10328 | ((uSrc >> (55-6)) & RT_BIT_64(6))
10329 | ((uSrc >> (63-7)) & RT_BIT_64(7));
10330}
10331
10332
10333IEM_DECL_IMPL_DEF(void, iemAImpl_pmovmskb_u128,(uint64_t *pu64Dst, PCRTUINT128U pu128Src))
10334{
10335 /* Take the most significant bit from each byte and store the resulting bits in the given general purpose register. */
10336 uint64_t const uSrc0 = pu128Src->QWords.qw0;
10337 uint64_t const uSrc1 = pu128Src->QWords.qw1;
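/* Byte 8's sign bit sits at bit 7 of qw1 but must end up in result bit 8, hence the left shift below; the higher bytes shift right as usual. */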
10338 *pu64Dst = ((uSrc0 >> ( 7-0)) & RT_BIT_64(0))
10339 | ((uSrc0 >> (15-1)) & RT_BIT_64(1))
10340 | ((uSrc0 >> (23-2)) & RT_BIT_64(2))
10341 | ((uSrc0 >> (31-3)) & RT_BIT_64(3))
10342 | ((uSrc0 >> (39-4)) & RT_BIT_64(4))
10343 | ((uSrc0 >> (47-5)) & RT_BIT_64(5))
10344 | ((uSrc0 >> (55-6)) & RT_BIT_64(6))
10345 | ((uSrc0 >> (63-7)) & RT_BIT_64(7))
10346 | ((uSrc1 << (1 /*7-8*/)) & RT_BIT_64(8))
10347 | ((uSrc1 >> (15-9)) & RT_BIT_64(9))
10348 | ((uSrc1 >> (23-10)) & RT_BIT_64(10))
10349 | ((uSrc1 >> (31-11)) & RT_BIT_64(11))
10350 | ((uSrc1 >> (39-12)) & RT_BIT_64(12))
10351 | ((uSrc1 >> (47-13)) & RT_BIT_64(13))
10352 | ((uSrc1 >> (55-14)) & RT_BIT_64(14))
10353 | ((uSrc1 >> (63-15)) & RT_BIT_64(15));
10354}
10355
10356#endif
10357
10358IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovmskb_u256_fallback,(uint64_t *pu64Dst, PCRTUINT256U puSrc))
10359{
10360 /* Take the most significant bit from each byte and store the resulting bits in the given general purpose register. */
10361 uint64_t const uSrc0 = puSrc->QWords.qw0;
10362 uint64_t const uSrc1 = puSrc->QWords.qw1;
10363 uint64_t const uSrc2 = puSrc->QWords.qw2;
10364 uint64_t const uSrc3 = puSrc->QWords.qw3;
10365 *pu64Dst = ((uSrc0 >> ( 7-0)) & RT_BIT_64(0))
10366 | ((uSrc0 >> (15-1)) & RT_BIT_64(1))
10367 | ((uSrc0 >> (23-2)) & RT_BIT_64(2))
10368 | ((uSrc0 >> (31-3)) & RT_BIT_64(3))
10369 | ((uSrc0 >> (39-4)) & RT_BIT_64(4))
10370 | ((uSrc0 >> (47-5)) & RT_BIT_64(5))
10371 | ((uSrc0 >> (55-6)) & RT_BIT_64(6))
10372 | ((uSrc0 >> (63-7)) & RT_BIT_64(7))
10373 | ((uSrc1 << (1 /*7-8*/)) & RT_BIT_64(8))
10374 | ((uSrc1 >> (15-9)) & RT_BIT_64(9))
10375 | ((uSrc1 >> (23-10)) & RT_BIT_64(10))
10376 | ((uSrc1 >> (31-11)) & RT_BIT_64(11))
10377 | ((uSrc1 >> (39-12)) & RT_BIT_64(12))
10378 | ((uSrc1 >> (47-13)) & RT_BIT_64(13))
10379 | ((uSrc1 >> (55-14)) & RT_BIT_64(14))
10380 | ((uSrc1 >> (63-15)) & RT_BIT_64(15))
10381 | ((uSrc2 << (9 /* 7-16*/)) & RT_BIT_64(16))
10382 | ((uSrc2 << (2 /*15-17*/)) & RT_BIT_64(17))
10383 | ((uSrc2 >> (23-18)) & RT_BIT_64(18))
10384 | ((uSrc2 >> (31-19)) & RT_BIT_64(19))
10385 | ((uSrc2 >> (39-20)) & RT_BIT_64(20))
10386 | ((uSrc2 >> (47-21)) & RT_BIT_64(21))
10387 | ((uSrc2 >> (55-22)) & RT_BIT_64(22))
10388 | ((uSrc2 >> (63-23)) & RT_BIT_64(23))
10389 | ((uSrc3 << (17 /* 7-24*/)) & RT_BIT_64(24))
10390 | ((uSrc3 << (10 /*15-25*/)) & RT_BIT_64(25))
10391 | ((uSrc3 << (3 /*23-26*/)) & RT_BIT_64(26))
10392 | ((uSrc3 >> (31-27)) & RT_BIT_64(27))
10393 | ((uSrc3 >> (39-28)) & RT_BIT_64(28))
10394 | ((uSrc3 >> (47-29)) & RT_BIT_64(29))
10395 | ((uSrc3 >> (55-30)) & RT_BIT_64(30))
10396 | ((uSrc3 >> (63-31)) & RT_BIT_64(31));
10397}
10398
10399
10400/*
10401 * [V]PSHUFB
10402 */
10403
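/*
 * Note: Each byte of the shuffle-control operand selects one source byte: if bit 7 is
 * set the destination byte is zeroed, otherwise the low 3 bits (MMX) or low 4 bits
 * (per 128-bit lane) index into the source.
 */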
10404IEM_DECL_IMPL_DEF(void, iemAImpl_pshufb_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
10405{
10406 RTUINT64U const uSrc = { *puSrc };
10407 RTUINT64U const uDstIn = { *puDst };
10408 ASMCompilerBarrier();
10409 RTUINT64U uDstOut = { 0 };
10410 for (unsigned iByte = 0; iByte < RT_ELEMENTS(uDstIn.au8); iByte++)
10411 {
10412 uint8_t idxSrc = uSrc.au8[iByte];
10413 if (!(idxSrc & 0x80))
10414 uDstOut.au8[iByte] = uDstIn.au8[idxSrc & 7];
10415 }
10416 *puDst = uDstOut.u;
10417 RT_NOREF(pFpuState);
10418}
10419
10420
10421IEM_DECL_IMPL_DEF(void, iemAImpl_pshufb_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
10422{
10423 RTUINT128U const uSrc = *puSrc;
10424 RTUINT128U const uDstIn = *puDst;
10425 ASMCompilerBarrier();
10426 puDst->au64[0] = 0;
10427 puDst->au64[1] = 0;
10428 for (unsigned iByte = 0; iByte < RT_ELEMENTS(puDst->au8); iByte++)
10429 {
10430 uint8_t idxSrc = uSrc.au8[iByte];
10431 if (!(idxSrc & 0x80))
10432 puDst->au8[iByte] = uDstIn.au8[idxSrc & 15];
10433 }
10434 RT_NOREF(pFpuState);
10435}
10436
10437
10438IEM_DECL_IMPL_DEF(void, iemAImpl_vpshufb_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
10439 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10440{
10441 RTUINT128U const uSrc1 = *puSrc1; /* could be same as puDst */
10442 RTUINT128U const uSrc2 = *puSrc2; /* could be same as puDst */
10443 ASMCompilerBarrier();
10444 puDst->au64[0] = 0;
10445 puDst->au64[1] = 0;
10446 for (unsigned iByte = 0; iByte < 16; iByte++)
10447 {
10448 uint8_t idxSrc = uSrc2.au8[iByte];
10449 if (!(idxSrc & 0x80))
10450 puDst->au8[iByte] = uSrc1.au8[(idxSrc & 15)];
10451 }
10452 RT_NOREF(pExtState);
10453}
10454
10455
10456IEM_DECL_IMPL_DEF(void, iemAImpl_vpshufb_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
10457 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10458{
10459 RTUINT256U const uSrc1 = *puSrc1; /* could be same as puDst */
10460 RTUINT256U const uSrc2 = *puSrc2; /* could be same as puDst */
10461 ASMCompilerBarrier();
10462 puDst->au64[0] = 0;
10463 puDst->au64[1] = 0;
10464 puDst->au64[2] = 0;
10465 puDst->au64[3] = 0;
10466 for (unsigned iByte = 0; iByte < 16; iByte++)
10467 {
10468 uint8_t idxSrc = uSrc2.au8[iByte];
10469 if (!(idxSrc & 0x80))
10470 puDst->au8[iByte] = uSrc1.au8[(idxSrc & 15)];
10471 }
10472 for (unsigned iByte = 16; iByte < RT_ELEMENTS(puDst->au8); iByte++)
10473 {
10474 uint8_t idxSrc = uSrc2.au8[iByte];
10475 if (!(idxSrc & 0x80))
10476 puDst->au8[iByte] = uSrc1.au8[(idxSrc & 15) + 16]; /* baka intel */
10477 }
10478 RT_NOREF(pExtState);
10479}
10480
10481
10482/*
10483 * PSHUFW, [V]PSHUFHW, [V]PSHUFLW, [V]PSHUFD
10484 */
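/* The 8-bit immediate (bEvil) encodes one 2-bit source index per destination
   element: bits 1:0 pick the element stored at position 0, bits 3:2 the element
   stored at position 1, and so on. E.g. bEvil = 0x1b (binary 00 01 10 11)
   reverses the four elements, while 0x00 broadcasts element 0. */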
10485#ifdef IEM_WITHOUT_ASSEMBLY
10486
10487IEM_DECL_IMPL_DEF(void, iemAImpl_pshufw_u64,(uint64_t *puDst, uint64_t const *puSrc, uint8_t bEvil))
10488{
10489 uint64_t const uSrc = *puSrc;
10490 ASMCompilerBarrier();
10491 *puDst = RT_MAKE_U64_FROM_U16(uSrc >> (( bEvil & 3) * 16),
10492 uSrc >> (((bEvil >> 2) & 3) * 16),
10493 uSrc >> (((bEvil >> 4) & 3) * 16),
10494 uSrc >> (((bEvil >> 6) & 3) * 16));
10495}
10496
10497
10498IEM_DECL_IMPL_DEF(void, iemAImpl_pshufhw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
10499{
10500 puDst->QWords.qw0 = puSrc->QWords.qw0;
10501 uint64_t const uSrc = puSrc->QWords.qw1;
10502 ASMCompilerBarrier();
10503 puDst->QWords.qw1 = RT_MAKE_U64_FROM_U16(uSrc >> (( bEvil & 3) * 16),
10504 uSrc >> (((bEvil >> 2) & 3) * 16),
10505 uSrc >> (((bEvil >> 4) & 3) * 16),
10506 uSrc >> (((bEvil >> 6) & 3) * 16));
10507}
10508
10509#endif
10510
10511IEM_DECL_IMPL_DEF(void, iemAImpl_vpshufhw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc, uint8_t bEvil))
10512{
10513 puDst->QWords.qw0 = puSrc->QWords.qw0;
10514 uint64_t const uSrc1 = puSrc->QWords.qw1;
10515 puDst->QWords.qw2 = puSrc->QWords.qw2;
10516 uint64_t const uSrc3 = puSrc->QWords.qw3;
10517 ASMCompilerBarrier();
10518 puDst->QWords.qw1 = RT_MAKE_U64_FROM_U16(uSrc1 >> (( bEvil & 3) * 16),
10519 uSrc1 >> (((bEvil >> 2) & 3) * 16),
10520 uSrc1 >> (((bEvil >> 4) & 3) * 16),
10521 uSrc1 >> (((bEvil >> 6) & 3) * 16));
10522 puDst->QWords.qw3 = RT_MAKE_U64_FROM_U16(uSrc3 >> (( bEvil & 3) * 16),
10523 uSrc3 >> (((bEvil >> 2) & 3) * 16),
10524 uSrc3 >> (((bEvil >> 4) & 3) * 16),
10525 uSrc3 >> (((bEvil >> 6) & 3) * 16));
10526}
10527
10528#ifdef IEM_WITHOUT_ASSEMBLY
10529IEM_DECL_IMPL_DEF(void, iemAImpl_pshuflw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
10530{
10531 puDst->QWords.qw1 = puSrc->QWords.qw1;
10532 uint64_t const uSrc = puSrc->QWords.qw0;
10533 ASMCompilerBarrier();
10534 puDst->QWords.qw0 = RT_MAKE_U64_FROM_U16(uSrc >> (( bEvil & 3) * 16),
10535 uSrc >> (((bEvil >> 2) & 3) * 16),
10536 uSrc >> (((bEvil >> 4) & 3) * 16),
10537 uSrc >> (((bEvil >> 6) & 3) * 16));
10538
10539}
10540#endif
10541
10542
10543IEM_DECL_IMPL_DEF(void, iemAImpl_vpshuflw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc, uint8_t bEvil))
10544{
10545 puDst->QWords.qw3 = puSrc->QWords.qw3;
10546 uint64_t const uSrc2 = puSrc->QWords.qw2;
10547 puDst->QWords.qw1 = puSrc->QWords.qw1;
10548 uint64_t const uSrc0 = puSrc->QWords.qw0;
10549 ASMCompilerBarrier();
10550 puDst->QWords.qw0 = RT_MAKE_U64_FROM_U16(uSrc0 >> (( bEvil & 3) * 16),
10551 uSrc0 >> (((bEvil >> 2) & 3) * 16),
10552 uSrc0 >> (((bEvil >> 4) & 3) * 16),
10553 uSrc0 >> (((bEvil >> 6) & 3) * 16));
10554 puDst->QWords.qw2 = RT_MAKE_U64_FROM_U16(uSrc2 >> (( bEvil & 3) * 16),
10555 uSrc2 >> (((bEvil >> 2) & 3) * 16),
10556 uSrc2 >> (((bEvil >> 4) & 3) * 16),
10557 uSrc2 >> (((bEvil >> 6) & 3) * 16));
10558
10559}
10560
10561
10562#ifdef IEM_WITHOUT_ASSEMBLY
10563IEM_DECL_IMPL_DEF(void, iemAImpl_pshufd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
10564{
10565 RTUINT128U const uSrc = *puSrc;
10566 ASMCompilerBarrier();
10567 puDst->au32[0] = uSrc.au32[bEvil & 3];
10568 puDst->au32[1] = uSrc.au32[(bEvil >> 2) & 3];
10569 puDst->au32[2] = uSrc.au32[(bEvil >> 4) & 3];
10570 puDst->au32[3] = uSrc.au32[(bEvil >> 6) & 3];
10571}
10572#endif
10573
10574
10575IEM_DECL_IMPL_DEF(void, iemAImpl_vpshufd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc, uint8_t bEvil))
10576{
10577 RTUINT256U const uSrc = *puSrc;
10578 ASMCompilerBarrier();
10579 puDst->au128[0].au32[0] = uSrc.au128[0].au32[bEvil & 3];
10580 puDst->au128[0].au32[1] = uSrc.au128[0].au32[(bEvil >> 2) & 3];
10581 puDst->au128[0].au32[2] = uSrc.au128[0].au32[(bEvil >> 4) & 3];
10582 puDst->au128[0].au32[3] = uSrc.au128[0].au32[(bEvil >> 6) & 3];
10583 puDst->au128[1].au32[0] = uSrc.au128[1].au32[bEvil & 3];
10584 puDst->au128[1].au32[1] = uSrc.au128[1].au32[(bEvil >> 2) & 3];
10585 puDst->au128[1].au32[2] = uSrc.au128[1].au32[(bEvil >> 4) & 3];
10586 puDst->au128[1].au32[3] = uSrc.au128[1].au32[(bEvil >> 6) & 3];
10587}
10588
10589
10590/*
10591 * PUNPCKHBW - high bytes -> words
10592 */
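/* The PUNPCKH* family interleaves the elements of the high halves of the two
   operands, taking the first-operand element before the matching second-operand
   element; e.g. for the 64-bit byte form the result is s1[4],s2[4],s1[5],s2[5],
   ...,s1[7],s2[7]. The PUNPCKL* variants further down do the same with the low
   halves. */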
10593#ifdef IEM_WITHOUT_ASSEMBLY
10594
10595IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhbw_u64,(uint64_t *puDst, uint64_t const *puSrc))
10596{
10597 RTUINT64U const uSrc2 = { *puSrc };
10598 RTUINT64U const uSrc1 = { *puDst };
10599 ASMCompilerBarrier();
10600 RTUINT64U uDstOut;
10601 uDstOut.au8[0] = uSrc1.au8[4];
10602 uDstOut.au8[1] = uSrc2.au8[4];
10603 uDstOut.au8[2] = uSrc1.au8[5];
10604 uDstOut.au8[3] = uSrc2.au8[5];
10605 uDstOut.au8[4] = uSrc1.au8[6];
10606 uDstOut.au8[5] = uSrc2.au8[6];
10607 uDstOut.au8[6] = uSrc1.au8[7];
10608 uDstOut.au8[7] = uSrc2.au8[7];
10609 *puDst = uDstOut.u;
10610}
10611
10612
10613IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhbw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
10614{
10615 RTUINT128U const uSrc2 = *puSrc;
10616 RTUINT128U const uSrc1 = *puDst;
10617 ASMCompilerBarrier();
10618 RTUINT128U uDstOut;
10619 uDstOut.au8[ 0] = uSrc1.au8[ 8];
10620 uDstOut.au8[ 1] = uSrc2.au8[ 8];
10621 uDstOut.au8[ 2] = uSrc1.au8[ 9];
10622 uDstOut.au8[ 3] = uSrc2.au8[ 9];
10623 uDstOut.au8[ 4] = uSrc1.au8[10];
10624 uDstOut.au8[ 5] = uSrc2.au8[10];
10625 uDstOut.au8[ 6] = uSrc1.au8[11];
10626 uDstOut.au8[ 7] = uSrc2.au8[11];
10627 uDstOut.au8[ 8] = uSrc1.au8[12];
10628 uDstOut.au8[ 9] = uSrc2.au8[12];
10629 uDstOut.au8[10] = uSrc1.au8[13];
10630 uDstOut.au8[11] = uSrc2.au8[13];
10631 uDstOut.au8[12] = uSrc1.au8[14];
10632 uDstOut.au8[13] = uSrc2.au8[14];
10633 uDstOut.au8[14] = uSrc1.au8[15];
10634 uDstOut.au8[15] = uSrc2.au8[15];
10635 *puDst = uDstOut;
10636}
10637
10638#endif
10639
10640IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhbw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10641{
10642 RTUINT128U const uSrc2 = *puSrc2;
10643 RTUINT128U const uSrc1 = *puSrc1;
10644 ASMCompilerBarrier();
10645 RTUINT128U uDstOut;
10646 uDstOut.au8[ 0] = uSrc1.au8[ 8];
10647 uDstOut.au8[ 1] = uSrc2.au8[ 8];
10648 uDstOut.au8[ 2] = uSrc1.au8[ 9];
10649 uDstOut.au8[ 3] = uSrc2.au8[ 9];
10650 uDstOut.au8[ 4] = uSrc1.au8[10];
10651 uDstOut.au8[ 5] = uSrc2.au8[10];
10652 uDstOut.au8[ 6] = uSrc1.au8[11];
10653 uDstOut.au8[ 7] = uSrc2.au8[11];
10654 uDstOut.au8[ 8] = uSrc1.au8[12];
10655 uDstOut.au8[ 9] = uSrc2.au8[12];
10656 uDstOut.au8[10] = uSrc1.au8[13];
10657 uDstOut.au8[11] = uSrc2.au8[13];
10658 uDstOut.au8[12] = uSrc1.au8[14];
10659 uDstOut.au8[13] = uSrc2.au8[14];
10660 uDstOut.au8[14] = uSrc1.au8[15];
10661 uDstOut.au8[15] = uSrc2.au8[15];
10662 *puDst = uDstOut;
10663}
10664
10665
10666IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhbw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10667{
10668 RTUINT256U const uSrc2 = *puSrc2;
10669 RTUINT256U const uSrc1 = *puSrc1;
10670 ASMCompilerBarrier();
10671 RTUINT256U uDstOut;
10672 uDstOut.au8[ 0] = uSrc1.au8[ 8];
10673 uDstOut.au8[ 1] = uSrc2.au8[ 8];
10674 uDstOut.au8[ 2] = uSrc1.au8[ 9];
10675 uDstOut.au8[ 3] = uSrc2.au8[ 9];
10676 uDstOut.au8[ 4] = uSrc1.au8[10];
10677 uDstOut.au8[ 5] = uSrc2.au8[10];
10678 uDstOut.au8[ 6] = uSrc1.au8[11];
10679 uDstOut.au8[ 7] = uSrc2.au8[11];
10680 uDstOut.au8[ 8] = uSrc1.au8[12];
10681 uDstOut.au8[ 9] = uSrc2.au8[12];
10682 uDstOut.au8[10] = uSrc1.au8[13];
10683 uDstOut.au8[11] = uSrc2.au8[13];
10684 uDstOut.au8[12] = uSrc1.au8[14];
10685 uDstOut.au8[13] = uSrc2.au8[14];
10686 uDstOut.au8[14] = uSrc1.au8[15];
10687 uDstOut.au8[15] = uSrc2.au8[15];
10688 /* As usual, the upper 128 bits are treated like a parallel register to the lower half. */
10689 uDstOut.au8[16] = uSrc1.au8[24];
10690 uDstOut.au8[17] = uSrc2.au8[24];
10691 uDstOut.au8[18] = uSrc1.au8[25];
10692 uDstOut.au8[19] = uSrc2.au8[25];
10693 uDstOut.au8[20] = uSrc1.au8[26];
10694 uDstOut.au8[21] = uSrc2.au8[26];
10695 uDstOut.au8[22] = uSrc1.au8[27];
10696 uDstOut.au8[23] = uSrc2.au8[27];
10697 uDstOut.au8[24] = uSrc1.au8[28];
10698 uDstOut.au8[25] = uSrc2.au8[28];
10699 uDstOut.au8[26] = uSrc1.au8[29];
10700 uDstOut.au8[27] = uSrc2.au8[29];
10701 uDstOut.au8[28] = uSrc1.au8[30];
10702 uDstOut.au8[29] = uSrc2.au8[30];
10703 uDstOut.au8[30] = uSrc1.au8[31];
10704 uDstOut.au8[31] = uSrc2.au8[31];
10705 *puDst = uDstOut;
10706}
10707
10708
10709/*
10710 * PUNPCKHWD - high words -> dwords
10711 */
10712#ifdef IEM_WITHOUT_ASSEMBLY
10713
10714IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhwd_u64,(uint64_t *puDst, uint64_t const *puSrc))
10715{
10716 RTUINT64U const uSrc2 = { *puSrc };
10717 RTUINT64U const uSrc1 = { *puDst };
10718 ASMCompilerBarrier();
10719 RTUINT64U uDstOut;
10720 uDstOut.au16[0] = uSrc1.au16[2];
10721 uDstOut.au16[1] = uSrc2.au16[2];
10722 uDstOut.au16[2] = uSrc1.au16[3];
10723 uDstOut.au16[3] = uSrc2.au16[3];
10724 *puDst = uDstOut.u;
10725}
10726
10727
10728IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhwd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
10729{
10730 RTUINT128U const uSrc2 = *puSrc;
10731 RTUINT128U const uSrc1 = *puDst;
10732 ASMCompilerBarrier();
10733 RTUINT128U uDstOut;
10734 uDstOut.au16[0] = uSrc1.au16[4];
10735 uDstOut.au16[1] = uSrc2.au16[4];
10736 uDstOut.au16[2] = uSrc1.au16[5];
10737 uDstOut.au16[3] = uSrc2.au16[5];
10738 uDstOut.au16[4] = uSrc1.au16[6];
10739 uDstOut.au16[5] = uSrc2.au16[6];
10740 uDstOut.au16[6] = uSrc1.au16[7];
10741 uDstOut.au16[7] = uSrc2.au16[7];
10742 *puDst = uDstOut;
10743}
10744
10745#endif
10746
10747IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhwd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10748{
10749 RTUINT128U const uSrc2 = *puSrc2;
10750 RTUINT128U const uSrc1 = *puSrc1;
10751 ASMCompilerBarrier();
10752 RTUINT128U uDstOut;
10753 uDstOut.au16[0] = uSrc1.au16[4];
10754 uDstOut.au16[1] = uSrc2.au16[4];
10755 uDstOut.au16[2] = uSrc1.au16[5];
10756 uDstOut.au16[3] = uSrc2.au16[5];
10757 uDstOut.au16[4] = uSrc1.au16[6];
10758 uDstOut.au16[5] = uSrc2.au16[6];
10759 uDstOut.au16[6] = uSrc1.au16[7];
10760 uDstOut.au16[7] = uSrc2.au16[7];
10761 *puDst = uDstOut;
10762}
10763
10764
10765IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhwd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10766{
10767 RTUINT256U const uSrc2 = *puSrc2;
10768 RTUINT256U const uSrc1 = *puSrc1;
10769 ASMCompilerBarrier();
10770 RTUINT256U uDstOut;
10771 uDstOut.au16[0] = uSrc1.au16[4];
10772 uDstOut.au16[1] = uSrc2.au16[4];
10773 uDstOut.au16[2] = uSrc1.au16[5];
10774 uDstOut.au16[3] = uSrc2.au16[5];
10775 uDstOut.au16[4] = uSrc1.au16[6];
10776 uDstOut.au16[5] = uSrc2.au16[6];
10777 uDstOut.au16[6] = uSrc1.au16[7];
10778 uDstOut.au16[7] = uSrc2.au16[7];
10779
10780 uDstOut.au16[8] = uSrc1.au16[12];
10781 uDstOut.au16[9] = uSrc2.au16[12];
10782 uDstOut.au16[10] = uSrc1.au16[13];
10783 uDstOut.au16[11] = uSrc2.au16[13];
10784 uDstOut.au16[12] = uSrc1.au16[14];
10785 uDstOut.au16[13] = uSrc2.au16[14];
10786 uDstOut.au16[14] = uSrc1.au16[15];
10787 uDstOut.au16[15] = uSrc2.au16[15];
10788 *puDst = uDstOut;
10789}
10790
10791
10792/*
10793 * PUNPCKHDQ - high dwords -> qword(s)
10794 */
10795#ifdef IEM_WITHOUT_ASSEMBLY
10796
10797IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhdq_u64,(uint64_t *puDst, uint64_t const *puSrc))
10798{
10799 RTUINT64U const uSrc2 = { *puSrc };
10800 RTUINT64U const uSrc1 = { *puDst };
10801 ASMCompilerBarrier();
10802 RTUINT64U uDstOut;
10803 uDstOut.au32[0] = uSrc1.au32[1];
10804 uDstOut.au32[1] = uSrc2.au32[1];
10805 *puDst = uDstOut.u;
10806}
10807
10808
10809IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhdq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
10810{
10811 RTUINT128U const uSrc2 = *puSrc;
10812 RTUINT128U const uSrc1 = *puDst;
10813 ASMCompilerBarrier();
10814 RTUINT128U uDstOut;
10815 uDstOut.au32[0] = uSrc1.au32[2];
10816 uDstOut.au32[1] = uSrc2.au32[2];
10817 uDstOut.au32[2] = uSrc1.au32[3];
10818 uDstOut.au32[3] = uSrc2.au32[3];
10819 *puDst = uDstOut;
10820}
10821
10822#endif
10823
10824IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhdq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10825{
10826 RTUINT128U const uSrc2 = *puSrc2;
10827 RTUINT128U const uSrc1 = *puSrc1;
10828 ASMCompilerBarrier();
10829 RTUINT128U uDstOut;
10830 uDstOut.au32[0] = uSrc1.au32[2];
10831 uDstOut.au32[1] = uSrc2.au32[2];
10832 uDstOut.au32[2] = uSrc1.au32[3];
10833 uDstOut.au32[3] = uSrc2.au32[3];
10834 *puDst = uDstOut;
10835}
10836
10837
10838IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhdq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10839{
10840 RTUINT256U const uSrc2 = *puSrc2;
10841 RTUINT256U const uSrc1 = *puSrc1;
10842 ASMCompilerBarrier();
10843 RTUINT256U uDstOut;
10844 uDstOut.au32[0] = uSrc1.au32[2];
10845 uDstOut.au32[1] = uSrc2.au32[2];
10846 uDstOut.au32[2] = uSrc1.au32[3];
10847 uDstOut.au32[3] = uSrc2.au32[3];
10848
10849 uDstOut.au32[4] = uSrc1.au32[6];
10850 uDstOut.au32[5] = uSrc2.au32[6];
10851 uDstOut.au32[6] = uSrc1.au32[7];
10852 uDstOut.au32[7] = uSrc2.au32[7];
10853 *puDst = uDstOut;
10854}
10855
10856
10857/*
10858 * PUNPCKHQDQ - high qwords -> double qword(s)
10859 */
10860#ifdef IEM_WITHOUT_ASSEMBLY
10861IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhqdq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
10862{
10863 RTUINT128U const uSrc2 = *puSrc;
10864 RTUINT128U const uSrc1 = *puDst;
10865 ASMCompilerBarrier();
10866 RTUINT128U uDstOut;
10867 uDstOut.au64[0] = uSrc1.au64[1];
10868 uDstOut.au64[1] = uSrc2.au64[1];
10869 *puDst = uDstOut;
10870}
10871#endif
10872
10873
10874IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhqdq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10875{
10876 RTUINT128U const uSrc2 = *puSrc2;
10877 RTUINT128U const uSrc1 = *puSrc1;
10878 ASMCompilerBarrier();
10879 RTUINT128U uDstOut;
10880 uDstOut.au64[0] = uSrc1.au64[1];
10881 uDstOut.au64[1] = uSrc2.au64[1];
10882 *puDst = uDstOut;
10883}
10884
10885
10886IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhqdq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10887{
10888 RTUINT256U const uSrc2 = *puSrc2;
10889 RTUINT256U const uSrc1 = *puSrc1;
10890 ASMCompilerBarrier();
10891 RTUINT256U uDstOut;
10892 uDstOut.au64[0] = uSrc1.au64[1];
10893 uDstOut.au64[1] = uSrc2.au64[1];
10894
10895 uDstOut.au64[2] = uSrc1.au64[3];
10896 uDstOut.au64[3] = uSrc2.au64[3];
10897 *puDst = uDstOut;
10898}
10899
10900
10901/*
10902 * PUNPCKLBW - low bytes -> words
10903 */
10904#ifdef IEM_WITHOUT_ASSEMBLY
10905
10906IEM_DECL_IMPL_DEF(void, iemAImpl_punpcklbw_u64,(uint64_t *puDst, uint64_t const *puSrc))
10907{
10908 RTUINT64U const uSrc2 = { *puSrc };
10909 RTUINT64U const uSrc1 = { *puDst };
10910 ASMCompilerBarrier();
10911 RTUINT64U uDstOut;
10912 uDstOut.au8[0] = uSrc1.au8[0];
10913 uDstOut.au8[1] = uSrc2.au8[0];
10914 uDstOut.au8[2] = uSrc1.au8[1];
10915 uDstOut.au8[3] = uSrc2.au8[1];
10916 uDstOut.au8[4] = uSrc1.au8[2];
10917 uDstOut.au8[5] = uSrc2.au8[2];
10918 uDstOut.au8[6] = uSrc1.au8[3];
10919 uDstOut.au8[7] = uSrc2.au8[3];
10920 *puDst = uDstOut.u;
10921}
10922
10923
10924IEM_DECL_IMPL_DEF(void, iemAImpl_punpcklbw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
10925{
10926 RTUINT128U const uSrc2 = *puSrc;
10927 RTUINT128U const uSrc1 = *puDst;
10928 ASMCompilerBarrier();
10929 RTUINT128U uDstOut;
10930 uDstOut.au8[ 0] = uSrc1.au8[0];
10931 uDstOut.au8[ 1] = uSrc2.au8[0];
10932 uDstOut.au8[ 2] = uSrc1.au8[1];
10933 uDstOut.au8[ 3] = uSrc2.au8[1];
10934 uDstOut.au8[ 4] = uSrc1.au8[2];
10935 uDstOut.au8[ 5] = uSrc2.au8[2];
10936 uDstOut.au8[ 6] = uSrc1.au8[3];
10937 uDstOut.au8[ 7] = uSrc2.au8[3];
10938 uDstOut.au8[ 8] = uSrc1.au8[4];
10939 uDstOut.au8[ 9] = uSrc2.au8[4];
10940 uDstOut.au8[10] = uSrc1.au8[5];
10941 uDstOut.au8[11] = uSrc2.au8[5];
10942 uDstOut.au8[12] = uSrc1.au8[6];
10943 uDstOut.au8[13] = uSrc2.au8[6];
10944 uDstOut.au8[14] = uSrc1.au8[7];
10945 uDstOut.au8[15] = uSrc2.au8[7];
10946 *puDst = uDstOut;
10947}
10948
10949#endif
10950
10951IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpcklbw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10952{
10953 RTUINT128U const uSrc2 = *puSrc2;
10954 RTUINT128U const uSrc1 = *puSrc1;
10955 ASMCompilerBarrier();
10956 RTUINT128U uDstOut;
10957 uDstOut.au8[ 0] = uSrc1.au8[0];
10958 uDstOut.au8[ 1] = uSrc2.au8[0];
10959 uDstOut.au8[ 2] = uSrc1.au8[1];
10960 uDstOut.au8[ 3] = uSrc2.au8[1];
10961 uDstOut.au8[ 4] = uSrc1.au8[2];
10962 uDstOut.au8[ 5] = uSrc2.au8[2];
10963 uDstOut.au8[ 6] = uSrc1.au8[3];
10964 uDstOut.au8[ 7] = uSrc2.au8[3];
10965 uDstOut.au8[ 8] = uSrc1.au8[4];
10966 uDstOut.au8[ 9] = uSrc2.au8[4];
10967 uDstOut.au8[10] = uSrc1.au8[5];
10968 uDstOut.au8[11] = uSrc2.au8[5];
10969 uDstOut.au8[12] = uSrc1.au8[6];
10970 uDstOut.au8[13] = uSrc2.au8[6];
10971 uDstOut.au8[14] = uSrc1.au8[7];
10972 uDstOut.au8[15] = uSrc2.au8[7];
10973 *puDst = uDstOut;
10974}
10975
10976
10977IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpcklbw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10978{
10979 RTUINT256U const uSrc2 = *puSrc2;
10980 RTUINT256U const uSrc1 = *puSrc1;
10981 ASMCompilerBarrier();
10982 RTUINT256U uDstOut;
10983 uDstOut.au8[ 0] = uSrc1.au8[0];
10984 uDstOut.au8[ 1] = uSrc2.au8[0];
10985 uDstOut.au8[ 2] = uSrc1.au8[1];
10986 uDstOut.au8[ 3] = uSrc2.au8[1];
10987 uDstOut.au8[ 4] = uSrc1.au8[2];
10988 uDstOut.au8[ 5] = uSrc2.au8[2];
10989 uDstOut.au8[ 6] = uSrc1.au8[3];
10990 uDstOut.au8[ 7] = uSrc2.au8[3];
10991 uDstOut.au8[ 8] = uSrc1.au8[4];
10992 uDstOut.au8[ 9] = uSrc2.au8[4];
10993 uDstOut.au8[10] = uSrc1.au8[5];
10994 uDstOut.au8[11] = uSrc2.au8[5];
10995 uDstOut.au8[12] = uSrc1.au8[6];
10996 uDstOut.au8[13] = uSrc2.au8[6];
10997 uDstOut.au8[14] = uSrc1.au8[7];
10998 uDstOut.au8[15] = uSrc2.au8[7];
10999 /* As usual, the upper 128 bits are treated like a parallel register to the lower half. */
11000 uDstOut.au8[16] = uSrc1.au8[16];
11001 uDstOut.au8[17] = uSrc2.au8[16];
11002 uDstOut.au8[18] = uSrc1.au8[17];
11003 uDstOut.au8[19] = uSrc2.au8[17];
11004 uDstOut.au8[20] = uSrc1.au8[18];
11005 uDstOut.au8[21] = uSrc2.au8[18];
11006 uDstOut.au8[22] = uSrc1.au8[19];
11007 uDstOut.au8[23] = uSrc2.au8[19];
11008 uDstOut.au8[24] = uSrc1.au8[20];
11009 uDstOut.au8[25] = uSrc2.au8[20];
11010 uDstOut.au8[26] = uSrc1.au8[21];
11011 uDstOut.au8[27] = uSrc2.au8[21];
11012 uDstOut.au8[28] = uSrc1.au8[22];
11013 uDstOut.au8[29] = uSrc2.au8[22];
11014 uDstOut.au8[30] = uSrc1.au8[23];
11015 uDstOut.au8[31] = uSrc2.au8[23];
11016 *puDst = uDstOut;
11017}
11018
11019
11020/*
11021 * PUNPCKLWD - low words -> dwords
11022 */
11023#ifdef IEM_WITHOUT_ASSEMBLY
11024
11025IEM_DECL_IMPL_DEF(void, iemAImpl_punpcklwd_u64,(uint64_t *puDst, uint64_t const *puSrc))
11026{
11027 RTUINT64U const uSrc2 = { *puSrc };
11028 RTUINT64U const uSrc1 = { *puDst };
11029 ASMCompilerBarrier();
11030 RTUINT64U uDstOut;
11031 uDstOut.au16[0] = uSrc1.au16[0];
11032 uDstOut.au16[1] = uSrc2.au16[0];
11033 uDstOut.au16[2] = uSrc1.au16[1];
11034 uDstOut.au16[3] = uSrc2.au16[1];
11035 *puDst = uDstOut.u;
11036}
11037
11038
11039IEM_DECL_IMPL_DEF(void, iemAImpl_punpcklwd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11040{
11041 RTUINT128U const uSrc2 = *puSrc;
11042 RTUINT128U const uSrc1 = *puDst;
11043 ASMCompilerBarrier();
11044 RTUINT128U uDstOut;
11045 uDstOut.au16[0] = uSrc1.au16[0];
11046 uDstOut.au16[1] = uSrc2.au16[0];
11047 uDstOut.au16[2] = uSrc1.au16[1];
11048 uDstOut.au16[3] = uSrc2.au16[1];
11049 uDstOut.au16[4] = uSrc1.au16[2];
11050 uDstOut.au16[5] = uSrc2.au16[2];
11051 uDstOut.au16[6] = uSrc1.au16[3];
11052 uDstOut.au16[7] = uSrc2.au16[3];
11053 *puDst = uDstOut;
11054}
11055
11056#endif
11057
11058IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpcklwd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11059{
11060 RTUINT128U const uSrc2 = *puSrc2;
11061 RTUINT128U const uSrc1 = *puSrc1;
11062 ASMCompilerBarrier();
11063 RTUINT128U uDstOut;
11064 uDstOut.au16[0] = uSrc1.au16[0];
11065 uDstOut.au16[1] = uSrc2.au16[0];
11066 uDstOut.au16[2] = uSrc1.au16[1];
11067 uDstOut.au16[3] = uSrc2.au16[1];
11068 uDstOut.au16[4] = uSrc1.au16[2];
11069 uDstOut.au16[5] = uSrc2.au16[2];
11070 uDstOut.au16[6] = uSrc1.au16[3];
11071 uDstOut.au16[7] = uSrc2.au16[3];
11072 *puDst = uDstOut;
11073}
11074
11075
11076IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpcklwd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11077{
11078 RTUINT256U const uSrc2 = *puSrc2;
11079 RTUINT256U const uSrc1 = *puSrc1;
11080 ASMCompilerBarrier();
11081 RTUINT256U uDstOut;
11082 uDstOut.au16[0] = uSrc1.au16[0];
11083 uDstOut.au16[1] = uSrc2.au16[0];
11084 uDstOut.au16[2] = uSrc1.au16[1];
11085 uDstOut.au16[3] = uSrc2.au16[1];
11086 uDstOut.au16[4] = uSrc1.au16[2];
11087 uDstOut.au16[5] = uSrc2.au16[2];
11088 uDstOut.au16[6] = uSrc1.au16[3];
11089 uDstOut.au16[7] = uSrc2.au16[3];
11090
11091 uDstOut.au16[8] = uSrc1.au16[8];
11092 uDstOut.au16[9] = uSrc2.au16[8];
11093 uDstOut.au16[10] = uSrc1.au16[9];
11094 uDstOut.au16[11] = uSrc2.au16[9];
11095 uDstOut.au16[12] = uSrc1.au16[10];
11096 uDstOut.au16[13] = uSrc2.au16[10];
11097 uDstOut.au16[14] = uSrc1.au16[11];
11098 uDstOut.au16[15] = uSrc2.au16[11];
11099 *puDst = uDstOut;
11100}
11101
11102
11103/*
11104 * PUNPCKLDQ - low dwords -> qword(s)
11105 */
11106#ifdef IEM_WITHOUT_ASSEMBLY
11107
11108IEM_DECL_IMPL_DEF(void, iemAImpl_punpckldq_u64,(uint64_t *puDst, uint64_t const *puSrc))
11109{
11110 RTUINT64U const uSrc2 = { *puSrc };
11111 RTUINT64U const uSrc1 = { *puDst };
11112 ASMCompilerBarrier();
11113 RTUINT64U uDstOut;
11114 uDstOut.au32[0] = uSrc1.au32[0];
11115 uDstOut.au32[1] = uSrc2.au32[0];
11116 *puDst = uDstOut.u;
11117}
11118
11119
11120IEM_DECL_IMPL_DEF(void, iemAImpl_punpckldq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11121{
11122 RTUINT128U const uSrc2 = *puSrc;
11123 RTUINT128U const uSrc1 = *puDst;
11124 ASMCompilerBarrier();
11125 RTUINT128U uDstOut;
11126 uDstOut.au32[0] = uSrc1.au32[0];
11127 uDstOut.au32[1] = uSrc2.au32[0];
11128 uDstOut.au32[2] = uSrc1.au32[1];
11129 uDstOut.au32[3] = uSrc2.au32[1];
11130 *puDst = uDstOut;
11131}
11132
11133#endif
11134
11135IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckldq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11136{
11137 RTUINT128U const uSrc2 = *puSrc2;
11138 RTUINT128U const uSrc1 = *puSrc1;
11139 ASMCompilerBarrier();
11140 RTUINT128U uDstOut;
11141 uDstOut.au32[0] = uSrc1.au32[0];
11142 uDstOut.au32[1] = uSrc2.au32[0];
11143 uDstOut.au32[2] = uSrc1.au32[1];
11144 uDstOut.au32[3] = uSrc2.au32[1];
11145 *puDst = uDstOut;
11146}
11147
11148
11149IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckldq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11150{
11151 RTUINT256U const uSrc2 = *puSrc2;
11152 RTUINT256U const uSrc1 = *puSrc1;
11153 ASMCompilerBarrier();
11154 RTUINT256U uDstOut;
11155 uDstOut.au32[0] = uSrc1.au32[0];
11156 uDstOut.au32[1] = uSrc2.au32[0];
11157 uDstOut.au32[2] = uSrc1.au32[1];
11158 uDstOut.au32[3] = uSrc2.au32[1];
11159
11160 uDstOut.au32[4] = uSrc1.au32[4];
11161 uDstOut.au32[5] = uSrc2.au32[4];
11162 uDstOut.au32[6] = uSrc1.au32[5];
11163 uDstOut.au32[7] = uSrc2.au32[5];
11164 *puDst = uDstOut;
11165}
11166
11167
11168/*
11169 * PUNPCKLQDQ - low qwords -> double qword(s)
11170 */
11171#ifdef IEM_WITHOUT_ASSEMBLY
11172IEM_DECL_IMPL_DEF(void, iemAImpl_punpcklqdq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11173{
11174 RTUINT128U const uSrc2 = *puSrc;
11175 RTUINT128U const uSrc1 = *puDst;
11176 ASMCompilerBarrier();
11177 RTUINT128U uDstOut;
11178 uDstOut.au64[0] = uSrc1.au64[0];
11179 uDstOut.au64[1] = uSrc2.au64[0];
11180 *puDst = uDstOut;
11181}
11182#endif
11183
11184
11185IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpcklqdq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11186{
11187 RTUINT128U const uSrc2 = *puSrc2;
11188 RTUINT128U const uSrc1 = *puSrc1;
11189 ASMCompilerBarrier();
11190 RTUINT128U uDstOut;
11191 uDstOut.au64[0] = uSrc1.au64[0];
11192 uDstOut.au64[1] = uSrc2.au64[0];
11193 *puDst = uDstOut;
11194}
11195
11196
11197IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpcklqdq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11198{
11199 RTUINT256U const uSrc2 = *puSrc2;
11200 RTUINT256U const uSrc1 = *puSrc1;
11201 ASMCompilerBarrier();
11202 RTUINT256U uDstOut;
11203 uDstOut.au64[0] = uSrc1.au64[0];
11204 uDstOut.au64[1] = uSrc2.au64[0];
11205
11206 uDstOut.au64[2] = uSrc1.au64[2];
11207 uDstOut.au64[3] = uSrc2.au64[2];
11208 *puDst = uDstOut;
11209}
11210
11211
11212/*
11213 * PACKSSWB - signed words -> signed bytes
11214 */
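/* Each signed 16-bit input is clamped to the signed 8-bit range [-128, 127]
   via SATURATED_SIGNED_WORD_TO_SIGNED_BYTE; the first operand supplies the low
   half of the result and the second operand the high half (per 128-bit lane in
   the 256-bit forms). */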
11215
11216#ifdef IEM_WITHOUT_ASSEMBLY
11217
11218IEM_DECL_IMPL_DEF(void, iemAImpl_packsswb_u64,(uint64_t *puDst, uint64_t const *puSrc))
11219{
11220 RTUINT64U const uSrc2 = { *puSrc };
11221 RTUINT64U const uSrc1 = { *puDst };
11222 ASMCompilerBarrier();
11223 RTUINT64U uDstOut;
11224 uDstOut.au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[0]);
11225 uDstOut.au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[1]);
11226 uDstOut.au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[2]);
11227 uDstOut.au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[3]);
11228 uDstOut.au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[0]);
11229 uDstOut.au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[1]);
11230 uDstOut.au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[2]);
11231 uDstOut.au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[3]);
11232 *puDst = uDstOut.u;
11233}
11234
11235
11236IEM_DECL_IMPL_DEF(void, iemAImpl_packsswb_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11237{
11238 RTUINT128U const uSrc2 = *puSrc;
11239 RTUINT128U const uSrc1 = *puDst;
11240 ASMCompilerBarrier();
11241 RTUINT128U uDstOut;
11242 uDstOut.au8[ 0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[0]);
11243 uDstOut.au8[ 1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[1]);
11244 uDstOut.au8[ 2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[2]);
11245 uDstOut.au8[ 3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[3]);
11246 uDstOut.au8[ 4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[4]);
11247 uDstOut.au8[ 5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[5]);
11248 uDstOut.au8[ 6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[6]);
11249 uDstOut.au8[ 7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[7]);
11250 uDstOut.au8[ 8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[0]);
11251 uDstOut.au8[ 9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[1]);
11252 uDstOut.au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[2]);
11253 uDstOut.au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[3]);
11254 uDstOut.au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[4]);
11255 uDstOut.au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[5]);
11256 uDstOut.au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[6]);
11257 uDstOut.au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[7]);
11258 *puDst = uDstOut;
11259}
11260
11261#endif
11262
11263IEM_DECL_IMPL_DEF(void, iemAImpl_vpacksswb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11264{
11265 RTUINT128U const uSrc2 = *puSrc2;
11266 RTUINT128U const uSrc1 = *puSrc1;
11267 ASMCompilerBarrier();
11268 RTUINT128U uDstOut;
11269 uDstOut.au8[ 0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[0]);
11270 uDstOut.au8[ 1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[1]);
11271 uDstOut.au8[ 2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[2]);
11272 uDstOut.au8[ 3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[3]);
11273 uDstOut.au8[ 4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[4]);
11274 uDstOut.au8[ 5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[5]);
11275 uDstOut.au8[ 6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[6]);
11276 uDstOut.au8[ 7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[7]);
11277 uDstOut.au8[ 8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[0]);
11278 uDstOut.au8[ 9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[1]);
11279 uDstOut.au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[2]);
11280 uDstOut.au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[3]);
11281 uDstOut.au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[4]);
11282 uDstOut.au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[5]);
11283 uDstOut.au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[6]);
11284 uDstOut.au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[7]);
11285 *puDst = uDstOut;
11286}
11287
11288
11289IEM_DECL_IMPL_DEF(void, iemAImpl_vpacksswb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11290{
11291 RTUINT256U const uSrc2 = *puSrc2;
11292 RTUINT256U const uSrc1 = *puSrc1;
11293 ASMCompilerBarrier();
11294 RTUINT256U uDstOut;
11295 uDstOut.au8[ 0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[0]);
11296 uDstOut.au8[ 1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[1]);
11297 uDstOut.au8[ 2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[2]);
11298 uDstOut.au8[ 3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[3]);
11299 uDstOut.au8[ 4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[4]);
11300 uDstOut.au8[ 5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[5]);
11301 uDstOut.au8[ 6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[6]);
11302 uDstOut.au8[ 7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[7]);
11303 uDstOut.au8[ 8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[0]);
11304 uDstOut.au8[ 9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[1]);
11305 uDstOut.au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[2]);
11306 uDstOut.au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[3]);
11307 uDstOut.au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[4]);
11308 uDstOut.au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[5]);
11309 uDstOut.au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[6]);
11310 uDstOut.au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[7]);
11311
11312 uDstOut.au8[16] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[ 8]);
11313 uDstOut.au8[17] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[ 9]);
11314 uDstOut.au8[18] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[10]);
11315 uDstOut.au8[19] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[11]);
11316 uDstOut.au8[20] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[12]);
11317 uDstOut.au8[21] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[13]);
11318 uDstOut.au8[22] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[14]);
11319 uDstOut.au8[23] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[15]);
11320 uDstOut.au8[24] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[ 8]);
11321 uDstOut.au8[25] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[ 9]);
11322 uDstOut.au8[26] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[10]);
11323 uDstOut.au8[27] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[11]);
11324 uDstOut.au8[28] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[12]);
11325 uDstOut.au8[29] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[13]);
11326 uDstOut.au8[30] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[14]);
11327 uDstOut.au8[31] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[15]);
11328 *puDst = uDstOut;
11329}
11330
11331
11332/*
11333 * PACKUSWB - signed words -> unsigned bytes
11334 */
11335#define SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(a_iWord) \
11336 ( (uint16_t)(a_iWord) <= (uint16_t)0xff \
11337 ? (uint8_t)(a_iWord) \
11338 : (uint8_t)0xff * (uint8_t)((((a_iWord) >> 15) & 1) ^ 1) ) /* 0xff = UINT8_MAX; 0x00 == UINT8_MIN; source bit 15 = sign */
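/* E.g. 0x0042 stays 0x42, 0x0123 saturates to 0xff (positive overflow) and
   0xff80 (i.e. -128) saturates to 0x00 (negative values clamp to zero). */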
11339
11340#ifdef IEM_WITHOUT_ASSEMBLY
11341
11342IEM_DECL_IMPL_DEF(void, iemAImpl_packuswb_u64,(uint64_t *puDst, uint64_t const *puSrc))
11343{
11344 RTUINT64U const uSrc2 = { *puSrc };
11345 RTUINT64U const uSrc1 = { *puDst };
11346 ASMCompilerBarrier();
11347 RTUINT64U uDstOut;
11348 uDstOut.au8[0] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[0]);
11349 uDstOut.au8[1] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[1]);
11350 uDstOut.au8[2] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[2]);
11351 uDstOut.au8[3] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[3]);
11352 uDstOut.au8[4] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[0]);
11353 uDstOut.au8[5] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[1]);
11354 uDstOut.au8[6] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[2]);
11355 uDstOut.au8[7] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[3]);
11356 *puDst = uDstOut.u;
11357}
11358
11359
11360IEM_DECL_IMPL_DEF(void, iemAImpl_packuswb_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11361{
11362 RTUINT128U const uSrc2 = *puSrc;
11363 RTUINT128U const uSrc1 = *puDst;
11364 ASMCompilerBarrier();
11365 RTUINT128U uDstOut;
11366 uDstOut.au8[ 0] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[0]);
11367 uDstOut.au8[ 1] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[1]);
11368 uDstOut.au8[ 2] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[2]);
11369 uDstOut.au8[ 3] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[3]);
11370 uDstOut.au8[ 4] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[4]);
11371 uDstOut.au8[ 5] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[5]);
11372 uDstOut.au8[ 6] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[6]);
11373 uDstOut.au8[ 7] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[7]);
11374 uDstOut.au8[ 8] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[0]);
11375 uDstOut.au8[ 9] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[1]);
11376 uDstOut.au8[10] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[2]);
11377 uDstOut.au8[11] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[3]);
11378 uDstOut.au8[12] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[4]);
11379 uDstOut.au8[13] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[5]);
11380 uDstOut.au8[14] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[6]);
11381 uDstOut.au8[15] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[7]);
11382 *puDst = uDstOut;
11383}
11384
11385#endif
11386
11387IEM_DECL_IMPL_DEF(void, iemAImpl_vpackuswb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11388{
11389 RTUINT128U const uSrc2 = *puSrc2;
11390 RTUINT128U const uSrc1 = *puSrc1;
11391 ASMCompilerBarrier();
11392 RTUINT128U uDstOut;
11393 uDstOut.au8[ 0] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[0]);
11394 uDstOut.au8[ 1] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[1]);
11395 uDstOut.au8[ 2] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[2]);
11396 uDstOut.au8[ 3] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[3]);
11397 uDstOut.au8[ 4] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[4]);
11398 uDstOut.au8[ 5] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[5]);
11399 uDstOut.au8[ 6] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[6]);
11400 uDstOut.au8[ 7] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[7]);
11401 uDstOut.au8[ 8] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[0]);
11402 uDstOut.au8[ 9] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[1]);
11403 uDstOut.au8[10] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[2]);
11404 uDstOut.au8[11] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[3]);
11405 uDstOut.au8[12] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[4]);
11406 uDstOut.au8[13] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[5]);
11407 uDstOut.au8[14] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[6]);
11408 uDstOut.au8[15] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[7]);
11409 *puDst = uDstOut;
11410}
11411
11412
11413IEM_DECL_IMPL_DEF(void, iemAImpl_vpackuswb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11414{
11415 RTUINT256U const uSrc2 = *puSrc2;
11416 RTUINT256U const uSrc1 = *puSrc1;
11417 ASMCompilerBarrier();
11418 RTUINT256U uDstOut;
11419 uDstOut.au8[ 0] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[0]);
11420 uDstOut.au8[ 1] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[1]);
11421 uDstOut.au8[ 2] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[2]);
11422 uDstOut.au8[ 3] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[3]);
11423 uDstOut.au8[ 4] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[4]);
11424 uDstOut.au8[ 5] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[5]);
11425 uDstOut.au8[ 6] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[6]);
11426 uDstOut.au8[ 7] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[7]);
11427 uDstOut.au8[ 8] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[0]);
11428 uDstOut.au8[ 9] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[1]);
11429 uDstOut.au8[10] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[2]);
11430 uDstOut.au8[11] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[3]);
11431 uDstOut.au8[12] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[4]);
11432 uDstOut.au8[13] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[5]);
11433 uDstOut.au8[14] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[6]);
11434 uDstOut.au8[15] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[7]);
11435
11436 uDstOut.au8[16] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[ 8]);
11437 uDstOut.au8[17] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[ 9]);
11438 uDstOut.au8[18] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[10]);
11439 uDstOut.au8[19] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[11]);
11440 uDstOut.au8[20] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[12]);
11441 uDstOut.au8[21] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[13]);
11442 uDstOut.au8[22] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[14]);
11443 uDstOut.au8[23] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[15]);
11444 uDstOut.au8[24] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[ 8]);
11445 uDstOut.au8[25] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[ 9]);
11446 uDstOut.au8[26] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[10]);
11447 uDstOut.au8[27] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[11]);
11448 uDstOut.au8[28] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[12]);
11449 uDstOut.au8[29] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[13]);
11450 uDstOut.au8[30] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[14]);
11451 uDstOut.au8[31] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[15]);
11452 *puDst = uDstOut;
11453}
11454
11455
11456/*
11457 * PACKSSDW - signed dwords -> signed words
11458 */
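/* Like PACKSSWB above, but each signed 32-bit input is clamped to the signed
   16-bit range [-32768, 32767] via SATURATED_SIGNED_DWORD_TO_SIGNED_WORD. */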
11459
11460#ifdef IEM_WITHOUT_ASSEMBLY
11461
11462IEM_DECL_IMPL_DEF(void, iemAImpl_packssdw_u64,(uint64_t *puDst, uint64_t const *puSrc))
11463{
11464 RTUINT64U const uSrc2 = { *puSrc };
11465 RTUINT64U const uSrc1 = { *puDst };
11466 ASMCompilerBarrier();
11467 RTUINT64U uDstOut;
11468 uDstOut.au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[0]);
11469 uDstOut.au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[1]);
11470 uDstOut.au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[0]);
11471 uDstOut.au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[1]);
11472 *puDst = uDstOut.u;
11473}
11474
11475
11476IEM_DECL_IMPL_DEF(void, iemAImpl_packssdw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11477{
11478 RTUINT128U const uSrc2 = *puSrc;
11479 RTUINT128U const uSrc1 = *puDst;
11480 ASMCompilerBarrier();
11481 RTUINT128U uDstOut;
11482 uDstOut.au16[ 0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[0]);
11483 uDstOut.au16[ 1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[1]);
11484 uDstOut.au16[ 2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[2]);
11485 uDstOut.au16[ 3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[3]);
11486 uDstOut.au16[ 4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[0]);
11487 uDstOut.au16[ 5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[1]);
11488 uDstOut.au16[ 6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[2]);
11489 uDstOut.au16[ 7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[3]);
11490 *puDst = uDstOut;
11491}
11492
11493#endif
11494
11495IEM_DECL_IMPL_DEF(void, iemAImpl_vpackssdw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11496{
11497 RTUINT128U const uSrc2 = *puSrc2;
11498 RTUINT128U const uSrc1 = *puSrc1;
11499 ASMCompilerBarrier();
11500 RTUINT128U uDstOut;
11501 uDstOut.au16[ 0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[0]);
11502 uDstOut.au16[ 1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[1]);
11503 uDstOut.au16[ 2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[2]);
11504 uDstOut.au16[ 3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[3]);
11505 uDstOut.au16[ 4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[0]);
11506 uDstOut.au16[ 5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[1]);
11507 uDstOut.au16[ 6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[2]);
11508 uDstOut.au16[ 7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[3]);
11509 *puDst = uDstOut;
11510}
11511
11512
11513IEM_DECL_IMPL_DEF(void, iemAImpl_vpackssdw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11514{
11515 RTUINT256U const uSrc2 = *puSrc2;
11516 RTUINT256U const uSrc1 = *puSrc1;
11517 ASMCompilerBarrier();
11518 RTUINT256U uDstOut;
11519 uDstOut.au16[ 0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[0]);
11520 uDstOut.au16[ 1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[1]);
11521 uDstOut.au16[ 2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[2]);
11522 uDstOut.au16[ 3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[3]);
11523 uDstOut.au16[ 4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[0]);
11524 uDstOut.au16[ 5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[1]);
11525 uDstOut.au16[ 6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[2]);
11526 uDstOut.au16[ 7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[3]);
11527
11528 uDstOut.au16[ 8] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[4]);
11529 uDstOut.au16[ 9] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[5]);
11530 uDstOut.au16[10] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[6]);
11531 uDstOut.au16[11] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[7]);
11532 uDstOut.au16[12] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[4]);
11533 uDstOut.au16[13] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[5]);
11534 uDstOut.au16[14] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[6]);
11535 uDstOut.au16[15] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[7]);
11536 *puDst = uDstOut;
11537}
11538
11539
11540/*
11541 * PACKUSDW - signed dwords -> unsigned words
11542 */
11543#define SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(a_iDword) \
11544 ( (uint32_t)(a_iDword) <= (uint16_t)0xffff \
11545 ? (uint16_t)(a_iDword) \
11546 : (uint16_t)0xffff * (uint16_t)((((a_iDword) >> 31) & 1) ^ 1) ) /* 0xffff = UINT16_MAX; source bit 31 = sign */
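/* E.g. 0x00001234 stays 0x1234, 0x00012345 saturates to 0xffff and a negative
   input such as 0xffffff00 (-256) saturates to 0x0000. */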
11547
11548#ifdef IEM_WITHOUT_ASSEMBLY
11549IEM_DECL_IMPL_DEF(void, iemAImpl_packusdw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11550{
11551 RTUINT128U const uSrc2 = *puSrc;
11552 RTUINT128U const uSrc1 = *puDst;
11553 ASMCompilerBarrier();
11554 RTUINT128U uDstOut;
11555 uDstOut.au16[ 0] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[0]);
11556 uDstOut.au16[ 1] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[1]);
11557 uDstOut.au16[ 2] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[2]);
11558 uDstOut.au16[ 3] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[3]);
11559 uDstOut.au16[ 4] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[0]);
11560 uDstOut.au16[ 5] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[1]);
11561 uDstOut.au16[ 6] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[2]);
11562 uDstOut.au16[ 7] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[3]);
11563 *puDst = uDstOut;
11564}
11565#endif
11566
11567IEM_DECL_IMPL_DEF(void, iemAImpl_vpackusdw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11568{
11569 RTUINT128U const uSrc2 = *puSrc2;
11570 RTUINT128U const uSrc1 = *puSrc1;
11571 ASMCompilerBarrier();
11572 RTUINT128U uDstOut;
11573 uDstOut.au16[ 0] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[0]);
11574 uDstOut.au16[ 1] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[1]);
11575 uDstOut.au16[ 2] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[2]);
11576 uDstOut.au16[ 3] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[3]);
11577 uDstOut.au16[ 4] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[0]);
11578 uDstOut.au16[ 5] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[1]);
11579 uDstOut.au16[ 6] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[2]);
11580 uDstOut.au16[ 7] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[3]);
11581 *puDst = uDstOut;
11582}
11583
11584
11585IEM_DECL_IMPL_DEF(void, iemAImpl_vpackusdw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11586{
11587 RTUINT256U const uSrc2 = *puSrc2;
11588 RTUINT256U const uSrc1 = *puSrc1;
11589 ASMCompilerBarrier();
11590 RTUINT256U uDstOut;
11591 uDstOut.au16[ 0] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[0]);
11592 uDstOut.au16[ 1] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[1]);
11593 uDstOut.au16[ 2] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[2]);
11594 uDstOut.au16[ 3] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[3]);
11595 uDstOut.au16[ 4] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[0]);
11596 uDstOut.au16[ 5] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[1]);
11597 uDstOut.au16[ 6] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[2]);
11598 uDstOut.au16[ 7] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[3]);
11599
11600 uDstOut.au16[ 8] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[4]);
11601 uDstOut.au16[ 9] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[5]);
11602 uDstOut.au16[10] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[6]);
11603 uDstOut.au16[11] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[7]);
11604 uDstOut.au16[12] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[4]);
11605 uDstOut.au16[13] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[5]);
11606 uDstOut.au16[14] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[6]);
11607 uDstOut.au16[15] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[7]);
11608 *puDst = uDstOut;
11609}
11610
11611
11612/*
11613 * CRC32 (SSE 4.2).
11614 */
11615
11616IEM_DECL_IMPL_DEF(void, iemAImpl_crc32_u8_fallback,(uint32_t *puDst, uint8_t uSrc))
11617{
11618 *puDst = RTCrc32CProcess(*puDst, &uSrc, sizeof(uSrc));
11619}
11620
11621
11622IEM_DECL_IMPL_DEF(void, iemAImpl_crc32_u16_fallback,(uint32_t *puDst, uint16_t uSrc))
11623{
11624 *puDst = RTCrc32CProcess(*puDst, &uSrc, sizeof(uSrc));
11625}
11626
11627IEM_DECL_IMPL_DEF(void, iemAImpl_crc32_u32_fallback,(uint32_t *puDst, uint32_t uSrc))
11628{
11629 *puDst = RTCrc32CProcess(*puDst, &uSrc, sizeof(uSrc));
11630}
11631
11632IEM_DECL_IMPL_DEF(void, iemAImpl_crc32_u64_fallback,(uint32_t *puDst, uint64_t uSrc))
11633{
11634 *puDst = RTCrc32CProcess(*puDst, &uSrc, sizeof(uSrc));
11635}
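/* Note that the CRC32 instruction uses the CRC-32C (Castagnoli) polynomial
   0x1EDC6F41, not the CRC-32 polynomial used by zip/zlib, which is why the
   fallbacks call RTCrc32CProcess rather than the plain CRC-32 helpers. */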
11636
11637
11638/*
11639 * PTEST (SSE 4.1) - special as it outputs only EFLAGS.
11640 */
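/* ZF is set when (src1 AND src2) is all zeroes and CF when (NOT src1 AND src2)
   is all zeroes; e.g. PTEST against the register itself sets ZF exactly when
   the register is all zeroes. */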
11641#ifdef IEM_WITHOUT_ASSEMBLY
11642IEM_DECL_IMPL_DEF(void, iemAImpl_ptest_u128,(PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint32_t *pfEFlags))
11643{
11644 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS;
11645 if ( (puSrc1->au64[0] & puSrc2->au64[0]) == 0
11646 && (puSrc1->au64[1] & puSrc2->au64[1]) == 0)
11647 fEfl |= X86_EFL_ZF;
11648 if ( (~puSrc1->au64[0] & puSrc2->au64[0]) == 0
11649 && (~puSrc1->au64[1] & puSrc2->au64[1]) == 0)
11650 fEfl |= X86_EFL_CF;
11651 *pfEFlags = fEfl;
11652}
11653#endif
11654
11655IEM_DECL_IMPL_DEF(void, iemAImpl_vptest_u256_fallback,(PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint32_t *pfEFlags))
11656{
11657 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS;
11658 if ( (puSrc1->au64[0] & puSrc2->au64[0]) == 0
11659 && (puSrc1->au64[1] & puSrc2->au64[1]) == 0
11660 && (puSrc1->au64[2] & puSrc2->au64[2]) == 0
11661 && (puSrc1->au64[3] & puSrc2->au64[3]) == 0)
11662 fEfl |= X86_EFL_ZF;
11663 if ( (~puSrc1->au64[0] & puSrc2->au64[0]) == 0
11664 && (~puSrc1->au64[1] & puSrc2->au64[1]) == 0
11665 && (~puSrc1->au64[2] & puSrc2->au64[2]) == 0
11666 && (~puSrc1->au64[3] & puSrc2->au64[3]) == 0)
11667 fEfl |= X86_EFL_CF;
11668 *pfEFlags = fEfl;
11669}
11670