VirtualBox

source: vbox/trunk/src/VBox/VMM/VMMAll/IEMAllAImpl-arm64.S@ 104301

Last change on this file since 104301 was 104301, checked in by vboxsync, 8 months ago

VMM/IEM: Shortened down the 8-bit and 16-bit ROL ARM assembly a little. bugref:10376

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 40.3 KB
1/* $Id: IEMAllAImpl-arm64.S 104301 2024-04-11 21:14:39Z vboxsync $ */
2/** @file
3 * IEM - Instruction Implementation in Assembly, ARM64 variant.
4 */
5
6/*
7 * Copyright (C) 2023 Oracle and/or its affiliates.
8 *
9 * This file is part of VirtualBox base platform packages, as
10 * available from https://www.virtualbox.org.
11 *
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License
14 * as published by the Free Software Foundation, in version 3 of the
15 * License.
16 *
17 * This program is distributed in the hope that it will be useful, but
18 * WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 * General Public License for more details.
21 *
22 * You should have received a copy of the GNU General Public License
23 * along with this program; if not, see <https://www.gnu.org/licenses>.
24 *
25 * SPDX-License-Identifier: GPL-3.0-only
26 */
27
28
29/*********************************************************************************************************************************
30* Header Files *
31*********************************************************************************************************************************/
32#include <iprt/asmdefs-arm.h>
33#include <iprt/x86.h>
34
35#define IEM_AIMPL_FUNCTION_ALIGNMENT 0x20
36
37
38#if RT_CLANG_PREREQ(15, 0)
39 .arch_extension flagm /* not necessary */
40#else
41 /* clang 12.0.x defaults to apple-a12. M1 is more similar to A14, I guess.
42 For some reason the +crc makes cfinv work (with clang 12). 'flagm' isn't
43 recognized, nor is the 'fmi' in the error message for cfinv. 'flagm'
44 works for v15 and is enabled by default, it seems. */
45# ifdef RT_OS_DARWIN
46 .cpu apple-a14+crc
47# else
48 .cpu cortex-a53+flagm
49# endif
50#endif
51
52
53.macro CALC_EFLAGS_PARITY, regEfl, regResult, regTmp
54 /*
55 * Parity calculation for low byte of the result (sucks that there is no popcount for gprs).
56 */
57 eor \regTmp, \regResult, \regResult, LSR #4
58 eor \regTmp, \regTmp, \regTmp, LSR #2
59 eor \regTmp, \regTmp, \regTmp, LSR #1
60 eor \regTmp, \regTmp, #1
61 bfi \regEfl, \regTmp, #X86_EFL_PF_BIT, #1 /* PF(2) = popcount(w9 & 0xff) & 1 ^ 1 */
62.endm
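 /* Worked example: if the result's low byte is 0x5a (0101'1010b, four bits set), the three
    folds leave popcount & 1 = 0 in bit 0 and the final EOR #1 yields PF = 1, matching the
    x86 rule that PF is set for even parity of the low byte. */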
63
64
65.macro CALC_EFLAGS_AUX_CARRY, regEfl, regResult, regLeft, regRight, regTmp
66 /*
67 * Auxiliary carry / borrow flag. This is related to 8-bit BCD.
68 */
69 eor \regTmp, \regLeft, \regRight
70 eor \regTmp, \regTmp, \regResult
71 lsr \regTmp, \regTmp, #X86_EFL_AF_BIT
72 bfi \regEfl, \regTmp, #X86_EFL_AF_BIT, #1 /* AF(4) = ((regLeft ^ regRight ^ regResult) & X86_EFL_AF) >> X86_EFL_AF_BIT */
73.endm
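 /* Worked example: 0x0f + 0x01 = 0x10: left ^ right ^ result = 0x0f ^ 0x01 ^ 0x10 = 0x1e,
    so bit 4 (X86_EFL_AF) is set - there was a carry out of the low nibble. The same
    expression yields the borrow out of the low nibble for subtractions. */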
74
75.macro CALC_EFLAGS, regEfl, regResult, regLeft, regRight, regTmp, fSkipFlags=0
76 /*
77 * Translate the arm NZCV bits into corresponding EFLAGS bits.
78 */
79 .if \fSkipFlags == 0 || \fSkipFlags == X86_EFL_OF
80#if 0
81 /* Maybe just a tiny bit slower than the next one. */
82 mrs \regTmp, NZCV /* [31] = N; [30] = Z; [29] = C; [28] = V */
83 .ifeq \fSkipFlags & X86_EFL_OF
84 lsr \regTmp, \regTmp, #28
85 bfi \regEfl, \regTmp, #X86_EFL_OF_BIT, #1
86 lsr \regTmp, \regTmp, #1
87 .else
88 lsr \regTmp, \regTmp, #29
89 .endif
90 eor \regTmp, \regTmp, #1 /* inverts the carry flag to x86 style. */
91 bfi \regEfl, \regTmp, #X86_EFL_CF_BIT, #1 /* CF(0) = C */
92 lsr \regTmp, \regTmp, #1
93 bfi \regEfl, \regTmp, #X86_EFL_ZF_BIT, #2 /* SF(7),ZF(6) = NZ */
94#else
95 /* This seems to be the faster one... */
96 cfinv
97 mrs \regTmp, NZCV /* [31] = N; [30] = Z; [29] = C; [28] = V */
98 .ifeq (\fSkipFlags & X86_EFL_OF)
99 lsr \regTmp, \regTmp, #28
100 bfi \regEfl, \regTmp, #X86_EFL_OF_BIT, #1
101 lsr \regTmp, \regTmp, #1
102 .else
103 lsr \regTmp, \regTmp, #29
104 .endif
105 bfi \regEfl, \regTmp, #X86_EFL_CF_BIT, #1 /* CF(0) = C */
106 lsr \regTmp, \regTmp, #1
107 bfi \regEfl, \regTmp, #X86_EFL_ZF_BIT, #2 /* SF(7),ZF(6) = NZ */
108#endif
109 .else
110 /* Definitely slower than the above two, but easier to handle wrt skipping parts. */
111 .ifeq \fSkipFlags & X86_EFL_ZF
112 cset \regTmp, eq
113 bfi \regEfl, \regTmp, #X86_EFL_ZF_BIT, #1
114 .endif
115 .ifeq \fSkipFlags & X86_EFL_CF
116 cset \regTmp, cc
117 bfi \regEfl, \regTmp, #X86_EFL_CF_BIT, #1
118 .endif
119 .ifeq \fSkipFlags & X86_EFL_OF
120 cset \regTmp, vs
121 bfi \regEfl, \regTmp, #X86_EFL_OF_BIT, #1
122 .endif
123 .ifeq \fSkipFlags & X86_EFL_SF
124 cset \regTmp, mi
125 bfi \regEfl, \regTmp, #X86_EFL_SF_BIT, #1
126 .endif
127 .endif
128
129
130 /*
131 * Parity calculation for low byte of the result (sucks that there is no popcount for gprs).
132 */
133 eor \regTmp, \regResult, \regResult, LSR #4
134 eor \regTmp, \regTmp, \regTmp, LSR #2
135 eor \regTmp, \regTmp, \regTmp, LSR #1
136 eor \regTmp, \regTmp, #1
137 bfi \regEfl, \regTmp, #X86_EFL_PF_BIT, #1 /* PF(2) = popcount(w9 & 0xff) & 1 ^ 1 */
138
139 /*
140 * Auxiliary carry / borrow flag. This is related to 8-bit BCD.
141 */
142 eor \regTmp, \regLeft, \regRight
143 eor \regTmp, \regTmp, \regResult
144 lsr \regTmp, \regTmp, #X86_EFL_AF_BIT
145 bfi \regEfl, \regTmp, #X86_EFL_AF_BIT, #1 /* AF(4) = ((regLeft ^ regRight ^ regResult) & X86_EFL_AF) >> X86_EFL_AF_BIT */
146
147 /* done */
148.endm
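 /* Note: ARM sets C=1 for a subtraction that does NOT borrow, whereas x86 sets CF=1 when a
    borrow occurs. That is why the carry is inverted (CFINV above, or the EOR #1 in the
    disabled variant) before being copied into EFLAGS.CF. */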
149
150
151BEGINCODE
152
153
154
155/* Some sketches.
156
157// IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u8_locked, (uint8_t *pu8Mem, uint8_t *pu8Reg));
158BEGINPROC_HIDDEN iemAImpl_xchg_u8_locked
159 ldrb w2, [x1]
160 swpalb w2, w2, [x0]
161 strb w2, [x1]
162 ret
163
164// IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u16_locked,(uint16_t *pu16Mem, uint16_t *pu16Reg));
165BEGINPROC_HIDDEN iemAImpl_xchg_u16_locked
166 ldrh w2, [x1]
167 swpalh w2, w2, [x0]
168 strh w2, [x1]
169 ret
170
171// IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u32_locked,(uint32_t *pu32Mem, uint32_t *pu32Reg));
172// IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u64_locked,(uint64_t *pu64Mem, uint64_t *pu64Reg));
173
174*/
175
176
177/* IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u8_locked, (uint8_t *pu8Mem, uint8_t *pu8Reg)); */
178
179/*
180 * The SUB instruction.
181 */
182
183/* uint32_t iemAImpl_sub_u8(uint32_t fEFlags, uint8_t *puDst, uint8_t uSrc); */
184ALIGNCODE(IEM_AIMPL_FUNCTION_ALIGNMENT)
185BEGINPROC_HIDDEN iemAImpl_sub_u8
186 .cfi_startproc
187 /* Do the subtraction. */
188 ldrb w8, [x1]
189 /*and w2, w2, #0xff - should not be necessary. */
190 subs w9, w8, w2 /* w9 = w8 (*puDst) - w2 (uSrc) */
191 strb w9, [x1]
192 setf8 w9
193
194 /* Calculate EFLAGS (passed in and returned via x0). */
195 and w9, w9, #0xffff
196 CALC_EFLAGS x0, x9, x8, x2, x11, X86_EFL_OF
197
198 /* The overflow flag calc done by setf8 isn't correct for subtraction, so we have to
199 figure it out ourselves. (See IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC for details.) */
200 eor w11, w8, w2 /* input dst ^ source (simplified from ~(dst ^ (source ^ 0x80)) ). */
201 eor w12, w8, w9
202 and w11, w12, w11
203 lsr w11, w11, #7
204 bfi w0, w11, #X86_EFL_OF_BIT, #1
205
206 ret
207 .cfi_endproc
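/* The OF calculation above implements OF = MSB((uDst ^ uSrc) & (uDst ^ uResult)): signed
   overflow happens when the operands have different signs and the result's sign differs
   from the minuend. Worked example: 0x80 - 0x01 = 0x7f gives
   (0x80 ^ 0x01) & (0x80 ^ 0x7f) = 0x81 & 0xff = 0x81, bit 7 set -> OF=1 (-128 - 1 overflows). */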
208
209
210/* uint32_t iemAImpl_sub_u16(uint32_t fEFlags, uint16_t *puDst, uint16_t uSrc); */
211ALIGNCODE(IEM_AIMPL_FUNCTION_ALIGNMENT)
212BEGINPROC_HIDDEN iemAImpl_sub_u16
213 .cfi_startproc
214 /* Do the subtraction. */
215 ldrh w8, [x1]
216 /*and w2, w2, #0xffff - should not be necessary. */
217 subs w9, w8, w2 /* w9 = w8 (*puDst) - w2 (uSrc) */
218 setf16 w9
219 strh w9, [x1]
220
221 /* Calculate EFLAGS (passed in and returned via x0). */
222 and w9, w9, #0xffff
223 CALC_EFLAGS x0, x9, x8, x2, x11, X86_EFL_OF
224
225 /* The overflow flag calc done by setf16 isn't correct for subtraction, so we have to
226 figure it out ourselves. (See IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC for details.) */
227 eor w11, w8, w2 /* input dst ^ source (simplified from ~(dst ^ (source ^ 0x8000)) ). */
228 eor w12, w8, w9
229 and w11, w12, w11
230 lsr w11, w11, #15
231 bfi w0, w11, #X86_EFL_OF_BIT, #1
232
233 ret
234 .cfi_endproc
235
236
237/* uint32_t iemAImpl_sub_u32(uint32_t fEFlags, uint32_t *puDst, uint32_t uSrc); */
238ALIGNCODE(IEM_AIMPL_FUNCTION_ALIGNMENT)
239BEGINPROC_HIDDEN iemAImpl_sub_u32
240 .cfi_startproc
241 /* Do the subtraction. */
242 ldr w8, [x1]
243 subs w9, w8, w2 /* w9 = w8 (*puDst) - w2 (uSrc) */
244 str w9, [x1]
245
246 /* Calculate EFLAGS (passed in and returned via x0). */
247
248#if 0
249 /* Translate the arm NZCV bits into corresponding EFLAGS bits. */
250#if 0 /* maybe just a tiny bit slower than the next one. */
251 mrs x11, NZCV /* w11[31] = N; w11[30] = Z; w11[29] = C; w11[28] = V */
252 lsr w11, w11, #28
253 bfi w0, w11, #X86_EFL_OF_BIT, #1
254 lsr w11, w11, #1
255 eor w11, w11, #1 /* inverts the carry flag to x86 style. */
256 bfi w0, w11, #X86_EFL_CF_BIT, #1 /* CF(0) = C */
257 lsr w11, w11, #1
258 bfi w0, w11, #X86_EFL_ZF_BIT, #2 /* SF(7),ZF(6) = NZ */
259#elif 1 /* seems the faster one... */
260 cfinv
261 mrs x11, NZCV /* w11[31] = N; w11[30] = Z; w11[29] = C; w11[28] = V */
262 lsr w11, w11, #28
263 bfi w0, w11, #X86_EFL_OF_BIT, #1
264 lsr w11, w11, #1
265 bfi w0, w11, #X86_EFL_CF_BIT, #1 /* CF(0) = C */
266 lsr w11, w11, #1
267 bfi w0, w11, #X86_EFL_ZF_BIT, #2 /* SF(7),ZF(6) = NZ */
268#else
269 cset w11, eq
270 bfi w0, w11, #X86_EFL_ZF_BIT, #1
271 cset w11, cc
272 bfi w0, w11, #X86_EFL_CF_BIT, #1
273 cset w11, vs
274 bfi w0, w11, #X86_EFL_OF_BIT, #1
275 cset w11, mi
276 bfi w0, w11, #X86_EFL_SF_BIT, #1
277#endif
278
279 /* Parity calculation for low byte of the result (sucks that there is no popcount for gprs). */
280 eor w11, w9, w9, LSR #4
281 eor w11, w11, w11, LSR #2
282 eor w11, w11, w11, LSR #1
283 eor w11, w11, #1
284 bfi w0, w11, #X86_EFL_PF_BIT, #1 /* PF(2) = popcount(w9 & 0xff) & 1 ^ 1 */
285
286 /* Auxiliary carry / borrow flag. This is related to 8-bit BCD. */
287 eor w11, w8, w2
288 eor w11, w11, w9
289 lsr w11, w11, #X86_EFL_AF_BIT
290 bfi w0, w11, #X86_EFL_AF_BIT, #1 /* AF(4) = ((w8 ^ w2 ^ w9) & X86_EFL_AF) >> X86_EFL_AF_BIT */
291#else
292 CALC_EFLAGS x0, x9, x8, x2, x11
293#endif
294
295 ret
296 .cfi_endproc
297
298
299/* uint32_t iemAImpl_sub_u64(uint32_t fEFlags, uint64_t *puDst, uint64_t uSrc); */
300ALIGNCODE(IEM_AIMPL_FUNCTION_ALIGNMENT)
301BEGINPROC_HIDDEN iemAImpl_sub_u64
302 .cfi_startproc
303 /* Do the subtraction. */
304 ldr x8, [x1]
305 subs x9, x8, x2 /* x9 = x8 (*puDst) - x2 (uSrc) */
306 str x9, [x1]
307
308 /* Calculate EFLAGS (passed in and returned via x0). */
309 CALC_EFLAGS x0, x9, x8, x2, x11
310
311 ret
312 .cfi_endproc
313
314
315
316/*
317 * Shift Left.
318 */
319
320/* uint32_t iemAImpl_shl_u8( uint32_t fEFlagsIn, uint8_t *pu8Dst, uint8_t cShift); */
321/* uint32_t iemAImpl_shl_u16(uint32_t fEFlagsIn, uint16_t *pu16Dst, uint8_t cShift); */
322/* uint32_t iemAImpl_shl_u32(uint32_t fEFlagsIn, uint32_t *pu32Dst, uint8_t cShift); */
323.macro SHL_8_16_32, a_Name, a_cBits, a_fIntelFlags, a_LdStSuff
324ALIGNCODE(IEM_AIMPL_FUNCTION_ALIGNMENT)
325BEGINPROC_HIDDEN \a_Name
326 .cfi_startproc
327
328 /* Do we need to shift anything at all? */
329 and w2, w2, #0x1f
330 cbz w2, 99f
331
332 /*
333 * Do the shifting
334 */
335 ldr\a_LdStSuff w8, [x1]
336.ifne \a_cBits < 32
337 lslv w9, w8, w2
338.else
339 lslv x9, x8, x2 /* use 64-bit registers here so we get CF for free. We know x2 != 0. */
340.endif
341 str\a_LdStSuff w9, [x1]
342
343 /*
344 * Calculate EFLAGS.
345 */
346 CALC_EFLAGS_PARITY w0, w9, w12
347
348.ifne \a_cBits < 32
349 setf\a_cBits w9 /* Sets NZ */
350.else
351 ands wzr, w9, w9 /* Sets NZ */
352.endif
353#if 1
354 mrs x11, NZCV
355 lsr w11, w11, #30 /* N=1; Z=0 */
356 bfi w0, w11, X86_EFL_ZF_BIT, 2 /* EFLAGS.ZF and EFLAGS.SF */
357#else
358 cset x11, eq
359 bfi w0, w11, X86_EFL_ZF_BIT, 1
360 cset x12, pl
361 bfi w0, w12, X86_EFL_SF_BIT, 1
362#endif
363
364.ifne \a_cBits < 32
365 bfxil w0, w9, #\a_cBits, #1 /* w9 bit 8/16 contains carry. (X86_EFL_CF_BIT == 0) */
366.else
367 bfxil x0, x9, #\a_cBits, #1 /* x9 bit 32 contains carry. (X86_EFL_CF_BIT == 0) */
368.endif
369
370.ifne \a_fIntelFlags
371 /* Intel: OF = first bit shifted: fEfl |= X86_EFL_GET_OF_ ## cOpBits(uDst ^ (uDst << 1)); */
372 eor w11, w8, w8, LSL #1
373 lsr w11, w11, #(\a_cBits - 1)
374 bfi w0, w11, #X86_EFL_OF_BIT, #1
375
376 and w0, w0, ~X86_EFL_AF /* AF is cleared */
377.else
378 /* AMD: OF = last bit shifted: fEfl |= ((uResult >> (cOpBits - 1)) ^ fCarry) << X86_EFL_OF_BIT; */
379 .ifne \a_cBits < 32
380 eor w11, w9, w9, LSR #1
381 lsr w11, w11, #(\a_cBits - 1)
382 .else
383 eor x11, x9, x9, LSR #1
384 lsr x11, x11, #(\a_cBits - 1)
385 .endif
386 bfi w0, w11, #X86_EFL_OF_BIT, #1
387
388 orr w0, w0, X86_EFL_AF /* AF is set */
389.endif
390
39199:
392 ret
393 .cfi_endproc
394.endm
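/* Worked example (shl_u8, uDst=0x81, cShift=1): the 32-bit LSLV yields 0x102, so the stored
   byte is 0x02 and bit 8 supplies CF=1 (the last bit shifted out). ZF=0, SF=0, PF=0 (one bit
   set in 0x02). Both OF variants come out as CF ^ MSB(result) = 1, as the x86 definition for
   1-bit shifts requires. */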
395
396SHL_8_16_32 iemAImpl_shl_u8, 8, 1, b
397SHL_8_16_32 iemAImpl_shl_u8_intel, 8, 1, b
398SHL_8_16_32 iemAImpl_shl_u8_amd, 8, 0, b
399
400SHL_8_16_32 iemAImpl_shl_u16, 16, 1, h
401SHL_8_16_32 iemAImpl_shl_u16_intel, 16, 1, h
402SHL_8_16_32 iemAImpl_shl_u16_amd, 16, 0, h
403
404SHL_8_16_32 iemAImpl_shl_u32, 32, 1,
405SHL_8_16_32 iemAImpl_shl_u32_intel, 32, 1,
406SHL_8_16_32 iemAImpl_shl_u32_amd, 32, 0,
407
408/** @todo this is slightly slower than the C version (release) on an M2. Investigate why. */
409/* uint32_t iemAImpl_shl_u64(uint32_t fEFlagsIn, uint64_t *pu64Dst, uint8_t cShift); */
410.macro SHL_64, a_Name, a_fIntelFlags
411ALIGNCODE(IEM_AIMPL_FUNCTION_ALIGNMENT)
412BEGINPROC_HIDDEN \a_Name
413 .cfi_startproc
414
415 /* Do we need to shift anything at all? */
416 and w2, w2, #0x3f
417 cbz w2, 99f
418
419 /*
420 * Do the shifting
421 */
422 ldr x8, [x1]
423 lslv x9, x8, x2
424 str x9, [x1]
425
426 /*
427 * Calculate EFLAGS.
428 */
429 CALC_EFLAGS_PARITY w0, w9, w11
430
431 ands xzr, x9, x9 /* Sets NZ */
432 mrs x11, NZCV
433 lsr w11, w11, #30 /* N=1; Z=0 */
434 bfi w0, w11, X86_EFL_ZF_BIT, 2 /* EFLAGS.ZF and EFLAGS.SF */
435
436 neg w11, w2 /* the shift count is MODed by the data size, so this is safe. */
437 lsrv x11, x8, x11
438 bfi w0, w11, X86_EFL_CF_BIT, 1
439
440.ifne \a_fIntelFlags
441 /* Intel: OF = first bit shifted: fEfl |= X86_EFL_GET_OF_ ## cOpBits(uDst ^ (uDst << 1)); */
442 eor x11, x8, x8, LSL #1
443 lsr x11, x11, #63
444 bfi w0, w11, #X86_EFL_OF_BIT, #1
445
446 and w0, w0, ~X86_EFL_AF /* AF is cleared */
447.else
448 /* AMD: OF = last bit shifted: fEfl |= ((uResult >> (cOpBits - 1)) ^ fCarry) << X86_EFL_OF_BIT; */
449 eor x11, x11, x9, LSR #63 /* w11[0]=CF from above */
450 bfi w0, w11, #X86_EFL_OF_BIT, #1
451
452 orr w0, w0, X86_EFL_AF /* AF is set */
453.endif
45499:
455 ret
456 .cfi_endproc
457.endm
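/* Note: the NEG/LSRV pair above extracts CF as bit (64 - cShift) of the input, i.e. the last
   bit shifted out of bit 63, instead of the wider-register trick used by the 8/16/32-bit
   variants (which pick the carry out of bit #cBits of the shift result). */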
458
459SHL_64 iemAImpl_shl_u64, 1
460SHL_64 iemAImpl_shl_u64_intel, 1
461SHL_64 iemAImpl_shl_u64_amd, 0
462
463
464/*
465 * Shift Right, Unsigned.
466 */
467
468/* uint32_t iemAImpl_shr_u8( uint32_t fEFlagsIn, uint8_t *pu8Dst, uint8_t cShift); */
469/* uint32_t iemAImpl_shr_u16(uint32_t fEFlagsIn, uint16_t *pu16Dst, uint8_t cShift); */
470/* uint32_t iemAImpl_shr_u32(uint32_t fEFlagsIn, uint32_t *pu32Dst, uint8_t cShift); */
471.macro shr_8_16_32, a_Name, a_cBits, a_fIntelFlags, a_LdStSuff
472ALIGNCODE(IEM_AIMPL_FUNCTION_ALIGNMENT)
473BEGINPROC_HIDDEN \a_Name
474 .cfi_startproc
475
476 /* Do we need to shift anything at all? */
477 and w2, w2, #0x1f
478 cbz w2, 99f
479
480 /*
481 * Do the shifting.
482 */
483 ldr\a_LdStSuff w8, [x1]
484 lsrv w9, w8, w2
485 str\a_LdStSuff w9, [x1]
486
487 /*
488 * Calculate EFLAGS.
489 */
490 sub w11, w2, #1
491 lsrv w11, w8, w11
492 bfxil w0, w11, #X86_EFL_CF_BIT, #1
493
494.ifne \a_fIntelFlags
495 and w0, w0, ~X86_EFL_AF /* AF is cleared */
496 /* Intel: OF = one bit shift: fEfl |= X86_EFL_GET_OF_ ## cOpBits(uDstIn); */
497 lsr w11, w8, #(\a_cBits - 1)
498 bfi w0, w11, #X86_EFL_OF_BIT, #1
499.else
500 orr w0, w0, X86_EFL_AF /* AF is set */
501 /* AMD: OF = last bits shifted: fEfl |= (uResult >> (cOpBits - 2)) << X86_EFL_OF_BIT; */
502 lsr w11, w9, #(\a_cBits - 2)
503 bfi w0, w11, #X86_EFL_OF_BIT, #1
504.endif
505
506 CALC_EFLAGS_PARITY w0, w9, w11
507
508.ifne \a_cBits < 32
509 setf\a_cBits w9 /* Sets NZ */
510.else
511 ands wzr, w9, w9 /* Sets NZ */
512.endif
513 mrs x11, NZCV
514 lsr w11, w11, #30 /* N=1; Z=0 */
515 bfi w0, w11, X86_EFL_ZF_BIT, 2 /* EFLAGS.ZF and EFLAGS.SF */
516
51799:
518 ret
519 .cfi_endproc
520.endm
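/* Worked example (shr_u8, uDst=0x81, cShift=1): result 0x40, CF = bit 0 of the input = 1
   (the last bit shifted out). Intel OF = MSB of the input = 1; AMD OF = bit 6 of the result
   = 1; for 1-bit shifts both match the architectural OF = MSB(original operand). */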
521
522shr_8_16_32 iemAImpl_shr_u8, 8, 1, b
523shr_8_16_32 iemAImpl_shr_u8_intel, 8, 1, b
524shr_8_16_32 iemAImpl_shr_u8_amd, 8, 0, b
525
526shr_8_16_32 iemAImpl_shr_u16, 16, 1, h
527shr_8_16_32 iemAImpl_shr_u16_intel, 16, 1, h
528shr_8_16_32 iemAImpl_shr_u16_amd, 16, 0, h
529
530shr_8_16_32 iemAImpl_shr_u32, 32, 1,
531shr_8_16_32 iemAImpl_shr_u32_intel, 32, 1,
532shr_8_16_32 iemAImpl_shr_u32_amd, 32, 0,
533
534/** @todo this is slightly slower than the C version (release) on an M2. Investigate why. */
535/* uint32_t iemAImpl_shr_u64(uint32_t fEFlagsIn, uint64_t *pu64Dst, uint8_t cShift); */
536.macro shr_64, a_Name, a_fIntelFlags
537ALIGNCODE(IEM_AIMPL_FUNCTION_ALIGNMENT)
538BEGINPROC_HIDDEN \a_Name
539 .cfi_startproc
540
541 /* Do we need to shift anything at all? */
542 ands w2, w2, #0x3f
543 b.eq 99f
544
545 /*
546 * Do the shifting
547 */
548 ldr x8, [x1]
549 lsrv x9, x8, x2
550 str x9, [x1]
551
552 /*
553 * Calculate EFLAGS.
554 */
555 sub w11, w2, #1
556 lsrv x11, x8, x11
557 bfxil w0, w11, #X86_EFL_CF_BIT, #1
558
559.ifne \a_fIntelFlags
560 and w0, w0, ~X86_EFL_AF /* AF is cleared */
561 /* Intel: OF = one bit shift: fEfl |= X86_EFL_GET_OF_ ## cOpBits(uDstIn); */
562 lsr x11, x8, #63
563 bfi w0, w11, #X86_EFL_OF_BIT, #1
564.else
565 orr w0, w0, X86_EFL_AF /* AF is set */
566 /* AMD: OF = last bits shifted: fEfl |= (uResult >> (cOpBits - 2)) << X86_EFL_OF_BIT; */
567 lsr x11, x9, #62
568 bfi w0, w11, #X86_EFL_OF_BIT, #1
569.endif
570
571 CALC_EFLAGS_PARITY w0, w9, w11
572
573 ands xzr, x9, x9 /* Sets NZ */
574 mrs x11, NZCV
575 lsr w11, w11, #30 /* N=1; Z=0 */
576 bfi w0, w11, X86_EFL_ZF_BIT, 2 /* EFLAGS.ZF and EFLAGS.SF */
577
57899:
579 ret
580 .cfi_endproc
581.endm
582
583shr_64 iemAImpl_shr_u64, 1
584shr_64 iemAImpl_shr_u64_intel, 1
585shr_64 iemAImpl_shr_u64_amd, 0
586
587
588/*
589 * Shift Right, Signed
590 */
591
592/* uint32_t iemAImpl_sar_u8( uint32_t fEFlagsIn, uint8_t *pu8Dst, uint8_t cShift); */
593/* uint32_t iemAImpl_sar_u16(uint32_t fEFlagsIn, uint16_t *pu16Dst, uint8_t cShift); */
594/* uint32_t iemAImpl_sar_u32(uint32_t fEFlagsIn, uint32_t *pu32Dst, uint8_t cShift); */
595.macro sar_8_16_32, a_Name, a_cBits, a_fIntelFlags, a_LdSuff, a_StSuff
596ALIGNCODE(IEM_AIMPL_FUNCTION_ALIGNMENT)
597BEGINPROC_HIDDEN \a_Name
598 .cfi_startproc
599
600 /* Do we need to shift anything at all? */
601 and w2, w2, #0x1f
602 cbz w2, 99f
603
604 /*
605 * Do the shifting.
606 */
607 ldr\a_LdSuff w8, [x1] /* Sign-extending for 8 and 16 bits! */
608 asrv w9, w8, w2
609 str\a_StSuff w9, [x1]
610
611 /*
612 * Calculate EFLAGS.
613 */
614 sub w11, w2, #1
615 lsrv w11, w8, w11
616 bfxil w0, w11, #X86_EFL_CF_BIT, #1
617
618.ifne \a_fIntelFlags
619 mov w11, ~(X86_EFL_AF | X86_EFL_OF)
620 and w0, w0, w11 /* AF and OF are cleared */
621.else
622 orr w0, w0, X86_EFL_AF /* AF is set */
623 and w0, w0, ~X86_EFL_OF /* OF is cleared */
624.endif
625
626 CALC_EFLAGS_PARITY w0, w9, w11
627
628.ifne \a_cBits < 32
629 setf\a_cBits w9 /* Sets NZ */
630.else
631 ands wzr, w9, w9 /* Sets NZ */
632.endif
633 mrs x11, NZCV
634 lsr w11, w11, #30 /* N=1; Z=0 */
635 bfi w0, w11, X86_EFL_ZF_BIT, 2 /* EFLAGS.ZF and EFLAGS.SF */
636
63799:
638 ret
639 .cfi_endproc
640.endm
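/* Worked example (sar_u8, uDst=0x81, cShift=1): the sign-extending load plus ASRV gives
   0xffffffc0, so the stored byte is 0xc0, CF = bit 0 of the input = 1, SF=1, ZF=0, and OF is
   cleared unconditionally by both code paths, matching SAR's architectural behaviour for
   1-bit shifts. */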
641
642sar_8_16_32 iemAImpl_sar_u8, 8, 1, sb, b
643sar_8_16_32 iemAImpl_sar_u8_intel, 8, 1, sb, b
644sar_8_16_32 iemAImpl_sar_u8_amd, 8, 0, sb, b
645
646sar_8_16_32 iemAImpl_sar_u16, 16, 1, sh, h
647sar_8_16_32 iemAImpl_sar_u16_intel, 16, 1, sh, h
648sar_8_16_32 iemAImpl_sar_u16_amd, 16, 0, sh, h
649
650sar_8_16_32 iemAImpl_sar_u32, 32, 1, ,
651sar_8_16_32 iemAImpl_sar_u32_intel, 32, 1, ,
652sar_8_16_32 iemAImpl_sar_u32_amd, 32, 0, ,
653
654/** @todo this is slightly slower than the C version (release) on an M2. Investigate why. */
655/* uint32_t iemAImpl_sar_u64(uint32_t fEFlagsIn, uint64_t *pu64Dst, uint8_t cShift); */
656.macro sar_64, a_Name, a_fIntelFlags
657ALIGNCODE(IEM_AIMPL_FUNCTION_ALIGNMENT)
658BEGINPROC_HIDDEN \a_Name
659 .cfi_startproc
660
661 /* Do we need to shift anything at all? */
662 ands w2, w2, #0x3f
663 b.eq 99f
664
665 /*
666 * Do the shifting
667 */
668 ldr x8, [x1]
669 asrv x9, x8, x2
670 str x9, [x1]
671
672 /*
673 * Calculate EFLAGS.
674 */
675 sub w11, w2, #1
676 lsrv x11, x8, x11
677 bfxil w0, w11, #X86_EFL_CF_BIT, #1
678
679.ifne \a_fIntelFlags
680 mov w11, ~(X86_EFL_AF | X86_EFL_OF)
681 and w0, w0, w11 /* AF and OF are cleared */
682.else
683 orr w0, w0, X86_EFL_AF /* AF is set */
684 and w0, w0, ~X86_EFL_OF /* OF is cleared */
685.endif
686
687 CALC_EFLAGS_PARITY w0, w9, w11
688
689 ands xzr, x9, x9 /* Sets NZ */
690 mrs x11, NZCV
691 lsr w11, w11, #30 /* N=1; Z=0 */
692 bfi w0, w11, X86_EFL_ZF_BIT, 2 /* EFLAGS.ZF and EFLAGS.SF */
693
69499:
695 ret
696 .cfi_endproc
697.endm
698
699sar_64 iemAImpl_sar_u64, 1
700sar_64 iemAImpl_sar_u64_intel, 1
701sar_64 iemAImpl_sar_u64_amd, 0
702
703
704/*
705 * Rotate Left.
706 */
707
708/* uint32_t iemAImpl_rol_u8( uint32_t fEFlagsIn, uint8_t *pu8Dst, uint8_t cShift); */
709/* uint32_t iemAImpl_rol_u16(uint32_t fEFlagsIn, uint16_t *pu16Dst, uint8_t cShift); */
710/* uint32_t iemAImpl_rol_u32(uint32_t fEFlagsIn, uint32_t *pu32Dst, uint8_t cShift); */
711.macro ROL_8_16_32, a_Name, a_cBits, a_fIntelFlags, a_LdStSuff
712ALIGNCODE(IEM_AIMPL_FUNCTION_ALIGNMENT)
713BEGINPROC_HIDDEN \a_Name
714 .cfi_startproc
715
716 /* Do we need to rotate anything at all? */
717 and w2, w2, #0x1f
718 cbz w2, 99f
719
720 /*
721 * Do the shifting
722 */
723.ifne \a_cBits < 32
724 and w2, w2, #(\a_cBits - 1)
725 neg w3, w2 /* the count is MODed by the data size, so this is safe. */
726 ldr\a_LdStSuff w8, [x1]
727 orr w8, w8, w8, LSL #(32 - \a_cBits) /* place a copy of the value at the top of the register, ready to be rotated in */
728 rorv w9, w8, w3
729 str\a_LdStSuff w9, [x1]
730.else
731 neg w3, w2 /* the count is MODed by the data size, so this is safe. */
732 ldr\a_LdStSuff w8, [x1]
733 rorv w9, w8, w3
734 str\a_LdStSuff w9, [x1]
735.endif
736
737 /*
738 * Calculate EFLAGS - only CF and OF.
739 */
740 bfi w0, w9, #0, #1 /* CF = last bit rotated around (new bottom bit) */
741
742.ifne \a_fIntelFlags
743 /* Intel: OF = first rotate step: fEfl |= X86_EFL_GET_OF_ ## cOpBits(uDst ^ (uDst << 1)); */
744 eor w11, w8, w8, LSL #1
745 lsr w11, w11, #(\a_cBits - 1)
746 bfi w0, w11, #X86_EFL_OF_BIT, #1
747.else
748 /* AMD: OF = last rotate step: fEfl |= ((uResult >> (cOpBits - 1)) ^ fCarry) << X86_EFL_OF_BIT; */
749 eor w11, w0, w9, LSR #(\a_cBits - 1)
750 bfi w0, w11, #X86_EFL_OF_BIT, #1
751.endif
752
75399:
754 ret
755 .cfi_endproc
756.endm
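/* Worked example (rol_u8, uDst=0x81, cShift=1): result 0x03, CF = new bit 0 = 1 (the bit
   rotated around). Intel OF = bit 7 of (uDst ^ (uDst << 1)) = 1; AMD OF = CF ^ bit 7 of the
   result = 1; for 1-bit rotates both equal the architectural MSB(result) ^ CF. */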
757
758ROL_8_16_32 iemAImpl_rol_u8, 8, 1, b
759ROL_8_16_32 iemAImpl_rol_u8_intel, 8, 1, b
760ROL_8_16_32 iemAImpl_rol_u8_amd, 8, 0, b
761
762ROL_8_16_32 iemAImpl_rol_u16, 16, 1, h
763ROL_8_16_32 iemAImpl_rol_u16_intel, 16, 1, h
764ROL_8_16_32 iemAImpl_rol_u16_amd, 16, 0, h
765
766ROL_8_16_32 iemAImpl_rol_u32, 32, 1,
767ROL_8_16_32 iemAImpl_rol_u32_intel, 32, 1,
768ROL_8_16_32 iemAImpl_rol_u32_amd, 32, 0,
769
770/** @todo this is slightly slower than the C version (release) on an M2. Investigate why. */
771/* uint32_t iemAImpl_rol_u64(uint32_t fEFlagsIn, uint64_t *pu64Dst, uint8_t cShift); */
772.macro ROL_64, a_Name, a_fIntelFlags
773ALIGNCODE(IEM_AIMPL_FUNCTION_ALIGNMENT)
774BEGINPROC_HIDDEN \a_Name
775 .cfi_startproc
776
777 /* Do we need to shift anything at all? */
778 and w2, w2, #0x3f
779 cbz w2, 99f
780
781 /*
782 * Do the shifting
783 */
784 neg w3, w2
785 ldr x8, [x1]
786 rorv x9, x8, x3
787 str x9, [x1]
788
789 /*
790 * Calculate EFLAGS - only CF and OF.
791 */
792 bfi w0, w9, #0, #1 /* CF = last bit rotated around */
793
794.ifne \a_fIntelFlags
795 /* Intel: OF = first rotate step: fEfl |= X86_EFL_GET_OF_ ## cOpBits(uDst ^ (uDst << 1)); */
796 eor x11, x8, x8, LSL #1
797 lsr x11, x11, #(64 - 1)
798 bfi w0, w11, #X86_EFL_OF_BIT, #1
799.else
800 /* AMD: OF = last rotate step: fEfl |= ((uResult >> (cOpBits - 1)) ^ fCarry) << X86_EFL_OF_BIT; */
801 eor x11, x0, x9, LSR #(64 - 1)
802 bfi w0, w11, #X86_EFL_OF_BIT, #1
803.endif
804
80599:
806 ret
807 .cfi_endproc
808.endm
809
810ROL_64 iemAImpl_rol_u64, 1
811ROL_64 iemAImpl_rol_u64_intel, 1
812ROL_64 iemAImpl_rol_u64_amd, 0
813
814
815/*
816 * Rotate Right.
817 */
818
819/* uint32_t iemAImpl_ror_u8( uint32_t fEFlagsIn, uint8_t *pu8Dst, uint8_t cShift); */
820/* uint32_t iemAImpl_ror_u16(uint32_t fEFlagsIn, uint16_t *pu16Dst, uint8_t cShift); */
821/* uint32_t iemAImpl_ror_u32(uint32_t fEFlagsIn, uint32_t *pu32Dst, uint8_t cShift); */
822.macro ROR_8_16_32, a_Name, a_cBits, a_fIntelFlags, a_LdStSuff
823ALIGNCODE(IEM_AIMPL_FUNCTION_ALIGNMENT)
824BEGINPROC_HIDDEN \a_Name
825 .cfi_startproc
826
827 /* Do we need to rotate anything at all? */
828 and w2, w2, #0x1f
829 cbz w2, 99f
830
831 /*
832 * Do the shifting
833 */
834.ifne \a_cBits < 32
835 and w2, w2, #(\a_cBits - 1)
836 ldr\a_LdStSuff w8, [x1]
837 orr w8, w8, w8, LSL #(\a_cBits) /* duplicate value above, so it is ready to be shifted in. */
838 lsrv w9, w8, w2
839 str\a_LdStSuff w9, [x1]
840.else
841 ldr\a_LdStSuff w8, [x1]
842 rorv w9, w8, w2
843 str\a_LdStSuff w9, [x1]
844.endif
845
846 /*
847 * Calculate EFLAGS - only CF and OF.
848 */
849 bfxil w0, w9, #(\a_cBits - 1), #1 /* CF = last bit rotated around (new top bit) */
850
851.ifne \a_fIntelFlags
852 /* Intel: OF = first rotate step: X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uDst << (a_cBitsWidth - 1))); */
853 eor w11, w8, w8, LSR #(\a_cBits - 1)
854 bfi w0, w11, #X86_EFL_OF_BIT, #1
855.else
856 /* AMD: OF = last rotate step: fEFlags |= (((uResult >> ((a_cBitsWidth) - 2)) ^ fCarry) & 1) << X86_EFL_OF_BIT; */
857 eor w11, w0, w9, LSR #(\a_cBits - 2)
858 bfi w0, w11, #X86_EFL_OF_BIT, #1
859.endif
860
86199:
862 ret
863 .cfi_endproc
864.endm
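/* Worked example (ror_u8, uDst=0x81, cShift=1): result 0xc0, CF = new bit 7 = 1. Intel OF =
   bit 0 ^ bit 7 of the input = 0; AMD OF = CF ^ bit 6 of the result = 0; for 1-bit rotates
   both equal the architectural XOR of the two top bits of the result. */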
865
866ROR_8_16_32 iemAImpl_ror_u8, 8, 1, b
867ROR_8_16_32 iemAImpl_ror_u8_intel, 8, 1, b
868ROR_8_16_32 iemAImpl_ror_u8_amd, 8, 0, b
869
870ROR_8_16_32 iemAImpl_ror_u16, 16, 1, h
871ROR_8_16_32 iemAImpl_ror_u16_intel, 16, 1, h
872ROR_8_16_32 iemAImpl_ror_u16_amd, 16, 0, h
873
874ROR_8_16_32 iemAImpl_ror_u32, 32, 1,
875ROR_8_16_32 iemAImpl_ror_u32_intel, 32, 1,
876ROR_8_16_32 iemAImpl_ror_u32_amd, 32, 0,
877
878/** @todo this is slightly slower than the C version (release) on an M2. Investigate why. */
879/* uint32_t iemAImpl_ror_u64(uint32_t fEFlagsIn, uint64_t *pu64Dst, uint8_t cShift); */
880.macro ROR_64, a_Name, a_fIntelFlags
881ALIGNCODE(IEM_AIMPL_FUNCTION_ALIGNMENT)
882BEGINPROC_HIDDEN \a_Name
883 .cfi_startproc
884
885 /* Do we need to shift anything at all? */
886 and w2, w2, #0x3f
887 cbz w2, 99f
888
889 /*
890 * Do the shifting
891 */
892 ldr x8, [x1]
893 rorv x9, x8, x2
894 str x9, [x1]
895
896 /*
897 * Calculate EFLAGS - only CF and OF.
898 */
899 bfxil x0, x9, #(64 - 1), #1 /* CF = last bit rotated around (new top bit) */
900
901.ifne \a_fIntelFlags
902 /* Intel: OF = first rotate step: X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uDst << (a_cBitsWidth - 1))); */
903 eor x11, x8, x8, LSR #(64 - 1)
904 bfi w0, w11, #X86_EFL_OF_BIT, #1
905.else
906 /* AMD: OF = last rotate step: fEFlags |= (((uResult >> ((a_cBitsWidth) - 2)) ^ fCarry) & 1) << X86_EFL_OF_BIT; */
907 eor x11, x0, x9, LSR #(64 - 2)
908 bfi w0, w11, #X86_EFL_OF_BIT, #1
909.endif
910
91199:
912 ret
913 .cfi_endproc
914.endm
915
916ROR_64 iemAImpl_ror_u64, 1
917ROR_64 iemAImpl_ror_u64_intel, 1
918ROR_64 iemAImpl_ror_u64_amd, 0
919
920
921/*
922 * Rotate Left thru Carry.
923 */
924
925/* uint32_t iemAImpl_rcl_u8( uint32_t fEFlagsIn, uint8_t *pu8Dst, uint8_t cShift); */
926/* uint32_t iemAImpl_rcl_u16(uint32_t fEFlagsIn, uint16_t *pu16Dst, uint8_t cShift); */
927/* uint32_t iemAImpl_rcl_u32(uint32_t fEFlagsIn, uint32_t *pu32Dst, uint8_t cShift); */
928.macro RCL_8_16_32, a_Name, a_cBits, a_fIntelFlags, a_LdStSuff
929ALIGNCODE(IEM_AIMPL_FUNCTION_ALIGNMENT)
930BEGINPROC_HIDDEN \a_Name
931 .cfi_startproc
932
933 /* Do we need to rotate anything at all? */
934 and w2, w2, #0x1f
935.ifne \a_cBits >= 32
936 cbz w2, 99f
937.else
938 .ifeq \a_fIntelFlags
939 cbz w2, 99f /* AMD */
940 .endif
941
942 /*
943 * 8 and 16 bit: w2 = w2 % (a_cBits + 1).
944 *
945 * Given that the w2 range is 0 thru 31, the 16-bit case can be reduced
946 * to:
947 * w2 = w2 >= 17 ? w2 - 17 : w2
948 *
949 * In the 8-bit scenario we're modding with 9, so we need to do it in
950 * two steps:
951 * w2 = w2 >= 18 ? w2 - 18 : w2
952 * w2 = w2 >= 9 ? w2 - 9 : w2
953 *
954 * For comparison clang generates the following for 16-bit:
955 * mov w9, #0xf0f0f0f1
956 * umull x9, w2, w9
957 * lsr x9, x9, #36
958 * bfi w9, w9, #4, #1
959 * sub w2, w2, w9
960 *
961 * The 8-bit variant differs only in the constants used:
962 * mov w9, #0x38e38e39
963 * umull x9, w2, w9
964 * lsr x9, x9, #33
965 * bfi w9, w9, #3, #2
966 * subs w8, w2, w9
967 */
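 /* Worked example: a raw count of 31 reduces to 31 - 17 = 14 for 16-bit (31 % 17 = 14),
    and to 31 - 18 = 13, then 13 - 9 = 4 for 8-bit (31 % 9 = 4). */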
968 .ifne \a_cBits == 16
969 subs w3, w2, #17
970 csel w2, w3, w2, hs
971 .else
972 subs w3, w2, #18
973 csel w2, w3, w2, hs
974 subs w3, w2, #9
975 csel w2, w3, w2, hs
976 .endif
977 .ifne \a_fIntelFlags
978 cbz w2, 99f /* Intel: Skip everything if the modded rotate count is zero. */
979 .endif
980.endif
981
982 /*
983 * Do the rotating: x9 = RORV(w8[0:a_cBits-1] | (CF << 63) | (w8[1:a_cBits-1] << (64-a_cBits-1)) | (CF << a_cBits), -w2)
984 */
985 neg w2, w2 /* w2 = rorv count - this will be masked by 0x3f so it's the same as 64-w2. */
986
987 ldr\a_LdStSuff w8, [x1]
988 .ifne \a_cBits < 32
989 orr x8, x8, x8, LSL #(64 - \a_cBits - 1)
990 .ifeq \a_fIntelFlags
991 bfi x8, x0, #(\a_cBits), #1 /* AMD: w8[a_cBits] = CF; Avoids conditional branch for CF calc to cover cShift==0. */
992 .endif
993 .else
994 lsr w9, w8, #1
995 orr x8, x8, x9, LSL #(64 - \a_cBits)
996 .endif
997 bfi x8, x0, #63, #1 /* w8[63] = CF */
998 rorv x9, x8, x2
999 str\a_LdStSuff w9, [x1]
1000
1001 /*
1002 * Calculate EFLAGS - only CF and OF.
1003 */
1004 bfxil x0, x9, #(\a_cBits), #1 /* CF = last bit rotated 'out' */
1005
1006.ifne \a_fIntelFlags
1007 /* Intel: OF = first rotate step: fEfl |= X86_EFL_GET_OF_ ## cOpBits(uDst ^ (uDst << 1)); */
1008 eor w11, w8, w8, LSL #1
1009 lsr w11, w11, #(\a_cBits - 1)
1010 bfi w0, w11, #X86_EFL_OF_BIT, #1
1011.else
1012 /* AMD: OF = last rotate step: fEfl |= ((uResult >> (cOpBits - 1)) ^ fCarry) << X86_EFL_OF_BIT; */
1013 eor w11, w0, w9, LSR #(\a_cBits - 1)
1014 bfi w0, w11, #X86_EFL_OF_BIT, #1
1015.endif
1016
101799:
1018 ret
1019 .cfi_endproc
1020.endm
1021
1022RCL_8_16_32 iemAImpl_rcl_u8, 8, 1, b
1023RCL_8_16_32 iemAImpl_rcl_u8_intel, 8, 1, b
1024RCL_8_16_32 iemAImpl_rcl_u8_amd, 8, 0, b
1025
1026RCL_8_16_32 iemAImpl_rcl_u16, 16, 1, h
1027RCL_8_16_32 iemAImpl_rcl_u16_intel, 16, 1, h
1028RCL_8_16_32 iemAImpl_rcl_u16_amd, 16, 0, h
1029
1030RCL_8_16_32 iemAImpl_rcl_u32, 32, 1,
1031RCL_8_16_32 iemAImpl_rcl_u32_intel, 32, 1,
1032RCL_8_16_32 iemAImpl_rcl_u32_amd, 32, 0,
1033
1034/** @todo this is slightly slower than the C version (release) on an M2. Investigate why. */
1035/* uint32_t iemAImpl_rcl_u64(uint32_t fEFlagsIn, uint64_t *pu64Dst, uint8_t cShift); */
1036.macro RCL_64, a_Name, a_fIntelFlags
1037ALIGNCODE(IEM_AIMPL_FUNCTION_ALIGNMENT)
1038BEGINPROC_HIDDEN \a_Name
1039 .cfi_startproc
1040
1041 /* Do we need to shift anything at all? */
1042 and w2, w2, #0x3f
1043 cbz w2, 99f /** @todo eliminate this for < 32 shift with intel flags */
1044
1045 /*
1046 * Do the rotating: (w8 << w2) | (CF << (w2 - 1)) | (w2 > 1 ? (w8 >> (64 - w2 + 1)) : 0)
1047 */
1048 and w3, w0, #X86_EFL_CF
1049 subs w4, w2, #1 /* Also: prep for 'w2 > 1' (w2 can't be zero, btw) - think: cmp w2, #1 */
1050 lslv x3, x3, x4 /* x3 = CF << (w2 - 1) */
1051
1052 mov w4, #(64 + 1)
1053 sub w4, w4, w2 /* w4 = 64 - w2 + 1 */
1054
1055 ldr x8, [x1]
1056 lslv x9, x8, x2
1057 lsrv x10, x8, x4
1058 csel x10, xzr, x10, eq /* if w2 == 1: x10 = 0; else: x10 = x8 >> (64 - w2 + 1); */
1059 orr x9, x9, x3 /* shifted CF */
1060 orr x9, x9, x10
1061 str x9, [x1]
1062
1063 /*
1064 * Calculate EFLAGS - only CF and OF.
1065 */
1066 neg x11, x2
1067 lsr x11, x8, x11
1068 bfi w0, w11, #0, #1 /* CF = last bit rotated out. */
1069
1070.ifne \a_fIntelFlags
1071 /* Intel: OF = first rotate step: fEfl |= X86_EFL_GET_OF_ ## cOpBits(uDst ^ (uDst << 1)); */
1072 eor x11, x8, x8, LSL #1
1073 lsr x11, x11, #(64 - 1)
1074 bfi w0, w11, #X86_EFL_OF_BIT, #1
1075.else
1076 /* AMD: OF = last rotate step: fEfl |= ((uResult >> (cOpBits - 1)) ^ fCarry) << X86_EFL_OF_BIT; */
1077 eor x11, x0, x9, LSR #(64 - 1)
1078 bfi w0, w11, #X86_EFL_OF_BIT, #1
1079.endif
1080
108199:
1082 ret
1083 .cfi_endproc
1084.endm
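/* Worked example (rcl_u64, uDst=0x8000000000000001, CF=1, cShift=1): result =
   (uDst << 1) | CF = 0x0000000000000003, new CF = old bit 63 = 1, and both OF variants give
   MSB(result) ^ CF = 0 ^ 1 = 1, matching the x86 definition for 1-bit rotate-through-carry. */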
1085
1086RCL_64 iemAImpl_rcl_u64, 1
1087RCL_64 iemAImpl_rcl_u64_intel, 1
1088RCL_64 iemAImpl_rcl_u64_amd, 0
1089
1090
1091/*
1092 * Rotate Right thru Carry.
1093 */
1094
1095/* uint32_t iemAImpl_rcr_u8( uint32_t fEFlagsIn, uint8_t *pu8Dst, uint8_t cShift); */
1096/* uint32_t iemAImpl_rcr_u16(uint32_t fEFlagsIn, uint16_t *pu16Dst, uint8_t cShift); */
1097/* uint32_t iemAImpl_rcr_u32(uint32_t fEFlagsIn, uint32_t *pu32Dst, uint8_t cShift); */
1098.macro RCR_8_16_32, a_Name, a_cBits, a_fIntelFlags, a_LdStSuff
1099ALIGNCODE(IEM_AIMPL_FUNCTION_ALIGNMENT)
1100BEGINPROC_HIDDEN \a_Name
1101 .cfi_startproc
1102
1103 /* Do we need to rotate anything at all? */
1104 and w2, w2, #0x1f
1105.ifne \a_cBits >= 32
1106 cbz w2, 99f
1107.else
1108 .ifeq \a_fIntelFlags
1109 cbz w2, 99f /* AMD */
1110 .endif
1111
1112 /*
1113 * 8 and 16 bit: w2 = w2 % (a_cBits + 1). (See RCL for details.)
1114 */
1115 .ifne \a_cBits == 16
1116 subs w3, w2, #17
1117 csel w2, w3, w2, hs
1118 .else
1119 subs w3, w2, #18
1120 csel w2, w3, w2, hs
1121 subs w3, w2, #9
1122 csel w2, w3, w2, hs
1123 .endif
1124 .ifne \a_fIntelFlags
1125 cbz w2, 99f /* Intel: Skip everything if the modded rotate count is zero. */
1126 .endif
1127.endif
1128
1129 /*
1130 * Do the rotating: x9 = RORV(x8[0:a_cBits-1] | (CF << a_cBits) | ((x8 << (a_cBits + 2)) >> 1) | (CF << 63), x2)
1131 */
1132 add w3, w2, #1 /* w3 = w2 + 1 */
1133
1134 subs w4, w2, #1
1135 mov w5, #(\a_cBits)
1136 csel w4, w5, w5, lo /* w4 = w2 >= 1 ? w2 - 1 : a_cBits - for CF extraction */
1137
1138 ldr\a_LdStSuff w8, [x1]
1139 bfi x8, x0, #(\a_cBits), #1 /* Put CF above the input. */
1140 bfi x8, x8, #(\a_cBits + 1), #(64 - \a_cBits - 1) /* Repeat the register content above that again. */
1141.ifne \a_cBits < 32
1142 .ifeq \a_fIntelFlags
1143 bfi x8, x0, #63, #1 /* AMD 8- and 16-bit: Put CF at the very top so w2 == 0 works w/o branching. */
1144 .endif
1145.endif
1146 rorv x9, x8, x2
1147 str\a_LdStSuff w9, [x1]
1148
1149 /*
1150 * Calculate EFLAGS - only CF and OF.
1151 */
1152 bfxil x0, x9, #63, #1 /* CF = last bit rotated 'out' */
1153
1154.ifne \a_fIntelFlags
1155 /* Intel: OF = first rotate step: fEFlags |= (fInCarry ^ (uint32_t)(uDst >> (a_cBits - 1))) << X86_EFL_OF_BIT; */
1156 eor x11, x8, x8, LSR #1 /* We've got CF in bit #a_cBits in x8 */
1157 lsr w11, w11, #(\a_cBits - 1)
1158 bfi w0, w11, #X86_EFL_OF_BIT, #1
1159.else
1160 /* AMD: OF = last rotate step: fEFlags |= X86_EFL_GET_OF_ ## a_cBits(uResult ^ (uResult << 1)); */
1161 eor w11, w9, w9, LSL #1
1162 lsr w11, w11, #(\a_cBits - 1)
1163 bfi w0, w11, #X86_EFL_OF_BIT, #1
1164.endif
1165
116699:
1167 ret
1168 .cfi_endproc
1169.endm
1170
1171RCR_8_16_32 iemAImpl_rcr_u8, 8, 1, b
1172RCR_8_16_32 iemAImpl_rcr_u8_intel, 8, 1, b
1173RCR_8_16_32 iemAImpl_rcr_u8_amd, 8, 0, b
1174
1175RCR_8_16_32 iemAImpl_rcr_u16, 16, 1, h
1176RCR_8_16_32 iemAImpl_rcr_u16_intel, 16, 1, h
1177RCR_8_16_32 iemAImpl_rcr_u16_amd, 16, 0, h
1178
1179RCR_8_16_32 iemAImpl_rcr_u32, 32, 1,
1180RCR_8_16_32 iemAImpl_rcr_u32_intel, 32, 1,
1181RCR_8_16_32 iemAImpl_rcr_u32_amd, 32, 0,
1182
1183/** @todo this is slightly slower than the C version (release) on an M2. Investigate why. */
1184/* uint32_t iemAImpl_rcr_u64(uint32_t fEFlagsIn, uint64_t *pu64Dst, uint8_t cShift); */
1185.macro RCR_64, a_Name, a_fIntelFlags
1186ALIGNCODE(IEM_AIMPL_FUNCTION_ALIGNMENT)
1187BEGINPROC_HIDDEN \a_Name
1188 .cfi_startproc
1189
1190 /* Do we need to shift anything at all? */
1191 and w2, w2, #0x3f
1192 cbz w2, 99f
1193
1194 /*
1195 * Do the rotating: (w8 >> w2) | (CF << (64 - w2)) | (w2 > 1 ? (w8 << (64 - w2 + 1)) : 0)
1196 */
1197 and w5, w0, #X86_EFL_CF /* x5 = input CF - for intel OF calc */
1198 neg w4, w2
1199 lslv x3, x5, x4 /* x3 = CF << (64 - w2) */
1200
1201 cmp w2, #1 /* prep for w2 > 1 */
1202 add w4, w4, #1 /* w4 = -w2 + 1; which when & 0x3f =^= 64 - w2 + 1 */
1203
1204 ldr x8, [x1]
1205 lsrv x9, x8, x2
1206 lslv x10, x8, x4
1207 csel x10, xzr, x10, eq /* if w2 == 1: x10 = 0; else: x10 = x8 << (64 - w2 + 1); */
1208 orr x9, x9, x3 /* shifted CF */
1209 orr x9, x9, x10
1210 str x9, [x1]
1211
1212 /*
1213 * Calculate EFLAGS - only CF and OF.
1214 */
1215 sub x11, x2, #1
1216 lsr x11, x8, x11
1217 bfi w0, w11, #0, #1 /* CF = last bit rotated out. */
1218
1219.ifne \a_fIntelFlags
1220 /* Intel: OF = first rotate step: fEFlags |= (fInCarry ^ (uint32_t)(uDst >> (a_cBits - 1))) << X86_EFL_OF_BIT; */
1221 eor x11, x5, x8, LSR #63
1222 bfi w0, w11, #X86_EFL_OF_BIT, #1
1223.else
1224 /* AMD: OF = last rotate step: fEFlags |= X86_EFL_GET_OF_ ## a_cBits(uResult ^ (uResult << 1)); */
1225 eor x11, x9, x9, LSL #1
1226 lsr x11, x11, #(64 - 1)
1227 bfi w0, w11, #X86_EFL_OF_BIT, #1
1228.endif
1229
123099:
1231 ret
1232 .cfi_endproc
1233.endm
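/* Worked example (rcr_u64, uDst=0x0000000000000001, CF=1, cShift=1): result =
   (uDst >> 1) | (CF << 63) = 0x8000000000000000, new CF = old bit 0 = 1, and both OF variants
   give old CF ^ old bit 63 = 1, i.e. the XOR of the two most significant bits of the result. */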
1234
1235RCR_64 iemAImpl_rcr_u64, 1
1236RCR_64 iemAImpl_rcr_u64_intel, 1
1237RCR_64 iemAImpl_rcr_u64_amd, 0
1238