VirtualBox

source: vbox/trunk/include/iprt/asm.h@ 103013

Last change on this file since 103013 was 103006, checked in by vboxsync, 10 months ago

iprt/asm.h,*: Split out the ASMMem* and related stuff into a separate header, asm-mem.h, so that we can get the RT_ASM_PAGE_SIZE stuff out of the way. [fix]

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 277.2 KB
Line 
1/** @file
2 * IPRT - Assembly Functions.
3 */
4
5/*
6 * Copyright (C) 2006-2023 Oracle and/or its affiliates.
7 *
8 * This file is part of VirtualBox base platform packages, as
9 * available from https://www.virtualbox.org.
10 *
11 * This program is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU General Public License
13 * as published by the Free Software Foundation, in version 3 of the
14 * License.
15 *
16 * This program is distributed in the hope that it will be useful, but
17 * WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, see <https://www.gnu.org/licenses>.
23 *
24 * The contents of this file may alternatively be used under the terms
25 * of the Common Development and Distribution License Version 1.0
26 * (CDDL), a copy of it is provided in the "COPYING.CDDL" file included
27 * in the VirtualBox distribution, in which case the provisions of the
28 * CDDL are applicable instead of those of the GPL.
29 *
30 * You may elect to license modified versions of this file under the
31 * terms and conditions of either the GPL or the CDDL or both.
32 *
33 * SPDX-License-Identifier: GPL-3.0-only OR CDDL-1.0
34 */
35
36#ifndef IPRT_INCLUDED_asm_h
37#define IPRT_INCLUDED_asm_h
38#ifndef RT_WITHOUT_PRAGMA_ONCE
39# pragma once
40#endif
41
42#include <iprt/cdefs.h>
43#include <iprt/types.h>
44#include <iprt/assert.h>
45/** @def RT_INLINE_ASM_USES_INTRIN
46 * Defined as 1 if we're using a _MSC_VER 1400.
47 * Otherwise defined as 0.
48 */
49
50/* Solaris 10 header ugliness */
51#ifdef u
52# undef u
53#endif
54
55#if defined(_MSC_VER) && RT_INLINE_ASM_USES_INTRIN
56/* Emit the intrinsics at all optimization levels. */
57# include <iprt/sanitized/intrin.h>
58# pragma intrinsic(_ReadWriteBarrier)
59# pragma intrinsic(__cpuid)
60# pragma intrinsic(__stosd)
61# pragma intrinsic(__stosw)
62# pragma intrinsic(__stosb)
63# pragma intrinsic(_BitScanForward)
64# pragma intrinsic(_BitScanReverse)
65# pragma intrinsic(_bittest)
66# pragma intrinsic(_bittestandset)
67# pragma intrinsic(_bittestandreset)
68# pragma intrinsic(_bittestandcomplement)
69# pragma intrinsic(_byteswap_ushort)
70# pragma intrinsic(_byteswap_ulong)
71# pragma intrinsic(_interlockedbittestandset)
72# pragma intrinsic(_interlockedbittestandreset)
73# pragma intrinsic(_InterlockedAnd)
74# pragma intrinsic(_InterlockedOr)
75# pragma intrinsic(_InterlockedXor)
76# pragma intrinsic(_InterlockedIncrement)
77# pragma intrinsic(_InterlockedDecrement)
78# pragma intrinsic(_InterlockedExchange)
79# pragma intrinsic(_InterlockedExchangeAdd)
80# pragma intrinsic(_InterlockedCompareExchange)
81# pragma intrinsic(_InterlockedCompareExchange8)
82# pragma intrinsic(_InterlockedCompareExchange16)
83# pragma intrinsic(_InterlockedCompareExchange64)
84# pragma intrinsic(_rotl)
85# pragma intrinsic(_rotr)
86# pragma intrinsic(_rotl64)
87# pragma intrinsic(_rotr64)
88# ifdef RT_ARCH_AMD64
89# pragma intrinsic(__stosq)
90# pragma intrinsic(_byteswap_uint64)
91# pragma intrinsic(_InterlockedCompareExchange128)
92# pragma intrinsic(_InterlockedExchange64)
93# pragma intrinsic(_InterlockedExchangeAdd64)
94# pragma intrinsic(_InterlockedAnd64)
95# pragma intrinsic(_InterlockedOr64)
96# pragma intrinsic(_InterlockedIncrement64)
97# pragma intrinsic(_InterlockedDecrement64)
98# endif
99#endif
100
101#if (defined(RT_ARCH_ARM64) && defined(RT_OS_DARWIN)) || defined(DOXYGEN_RUNNING)
102/** @def RTASM_ARM64_USE_FEAT_LSE
103 * Use instructions from the FEAT_LSE set to implement atomic operations,
104 * assuming that the host CPU always supports these. Only defined for ARM64 on Darwin (and for doxygen). */
105# define RTASM_ARM64_USE_FEAT_LSE 1
106/** @def RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB
107 * Set to skip the explicit DMB in most places and rely on the acquire-release
108 * aspects of the instructions (e.g. SWPAL instead of DMB + SWP) to do the serializing.
109 * The assumption is that the tstRTInline benchmark may be skewing the results by testing an unusual scenario. */
110# define RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB 1
111#endif
112
113
114/*
115 * Undefine all symbols we have Watcom C/C++ #pragma aux'es for.
116 */
117#if defined(__WATCOMC__) && ARCH_BITS == 16 && defined(RT_ARCH_X86)
118# include "asm-watcom-x86-16.h"
119#elif defined(__WATCOMC__) && ARCH_BITS == 32 && defined(RT_ARCH_X86)
120# include "asm-watcom-x86-32.h"
121#endif
122
123
124/** @defgroup grp_rt_asm ASM - Assembly Routines
125 * @ingroup grp_rt
126 *
127 * @remarks The difference between ordered and unordered atomic operations is
128 * that the former will complete outstanding reads and writes before
129 * continuing while the latter don't make any promises about the
130 * order. Ordered operations don't, it seems, make any 100% promise
131 * with regard to whether the operation will complete before any subsequent
132 * memory access. (Please correct this if wrong.)
133 *
134 * ASMAtomicSomething operations are all ordered, while
135 * ASMAtomicUoSomething are unordered (note the Uo).
136 *
137 * Please note that ordered operations do not necessarily imply a
138 * compiler (memory) barrier. The user has to use the
139 * ASMCompilerBarrier() macro when that is deemed necessary.
140 *
141 * @remarks Some remarks about __volatile__: Without this keyword gcc is allowed
142 * to reorder or even optimize assembler instructions away. For
143 * instance, in the following code the second rdmsr instruction is
144 * optimized away because gcc treats that instruction as deterministic:
145 *
146 * @code
147 * static inline uint64_t rdmsr_low(int idx)
148 * {
149 * uint32_t low;
150 * __asm__ ("rdmsr" : "=a"(low) : "c"(idx) : "edx");
151 * }
152 * ...
153 * uint32_t msr1 = rdmsr_low(1);
154 * foo(msr1);
155 * msr1 = rdmsr_low(1);
156 * bar(msr1);
157 * @endcode
158 *
159 * The input parameter of rdmsr_low is the same for both calls and
160 * therefore gcc will use the result of the first call as input
161 * parameter for bar() as well. For rdmsr this is not acceptable as
162 * this instruction is _not_ deterministic. This applies to reading
163 * machine status information in general.
164 *
165 * @{
166 */
167
168
169/** @def RT_INLINE_ASM_GCC_4_3_X_X86
170 * Used to work around some 4.3.x register allocation issues in this version of
171 * the compiler. So far this workaround is still required for 4.4 and 4.5 but
172 * definitely not for 5.x.
 *
 * Defined as 1 when RT_GNUC_PREREQ(4, 3) holds, RT_GNUC_PREREQ(5, 0) does not
 * and __i386__ is defined; defined as 0 otherwise. */
173#if (RT_GNUC_PREREQ(4, 3) && !RT_GNUC_PREREQ(5, 0) && defined(__i386__))
174# define RT_INLINE_ASM_GCC_4_3_X_X86 1
175#else
176# define RT_INLINE_ASM_GCC_4_3_X_X86 0
177#endif
178
179/** @def RT_INLINE_DONT_MIX_CMPXCHG8B_AND_PIC
180 * i686-apple-darwin9-gcc-4.0.1 (GCC) 4.0.1 (Apple Inc. build 5493) screws up
181 * RTSemRWRequestWrite semsemrw-lockless-generic.cpp in release builds. PIC
182 * mode, x86.
183 *
184 * Some gcc 4.3.x versions may have register allocation issues with cmpxchg8b
185 * when in PIC mode on x86.
 *
 * When this is 1, ASMAtomicXchgU64 is only declared in this header instead of
 * being defined inline. The value is only chosen here if the macro is not
 * already defined, so it can be preset before this point.
186 */
187#ifndef RT_INLINE_DONT_MIX_CMPXCHG8B_AND_PIC
188# if defined(DOXYGEN_RUNNING) || defined(__WATCOMC__) /* Watcom has trouble with the expression below */
189# define RT_INLINE_DONT_MIX_CMPXCHG8B_AND_PIC 1
190# elif defined(_MSC_VER) /* Visual C++ has trouble too, but it'll only tell us when C4688 is enabled. */
191# define RT_INLINE_DONT_MIX_CMPXCHG8B_AND_PIC 0
/* PIC code on 32-bit x86 built with an affected GCC (see RT_INLINE_ASM_GCC_4_3_X_X86) or for Darwin. */
192# elif ( (defined(PIC) || defined(__PIC__)) \
193 && defined(RT_ARCH_X86) \
194 && ( RT_INLINE_ASM_GCC_4_3_X_X86 \
195 || defined(RT_OS_DARWIN)) )
196# define RT_INLINE_DONT_MIX_CMPXCHG8B_AND_PIC 1
197# else
198# define RT_INLINE_DONT_MIX_CMPXCHG8B_AND_PIC 0
199# endif
200#endif
201
202
203/*
204 * ARM is great fun.
 *
 * This section provides, per ARM flavour:
 *   - RTASM_ARM_<barrier>: the barrier instruction as an inline assembly
 *     string (RTASM_ARM_NO_BARRIER is empty).
 *   - RTASM_ARM_<barrier>_IN_REG and RTASM_ARM_<barrier>_COMMA_IN_REG: the
 *     extra input operand (if any) belonging to that barrier string, without
 *     and with a leading comma respectively.
 *   - RTASM_ARM_PICK_6432: expands to its first argument on ARM64 and to its
 *     second argument on ARM32.
 *   - RTASM_ARM_LOAD_MODIFY_STORE_RET_{NEW,OLD}_{32,64}: declare the local
 *     variables and emit an exclusive-load / modify / exclusive-store loop
 *     that is retried for as long as the exclusive store returns non-zero.
205 */
206#if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
207
208# define RTASM_ARM_NO_BARRIER
209# ifdef RT_ARCH_ARM64
/* ARM64: the barrier instructions take no register operand, so all the
   *_IN_REG and *_COMMA_IN_REG helpers are empty. */
210# define RTASM_ARM_NO_BARRIER_IN_REG
211# define RTASM_ARM_NO_BARRIER_COMMA_IN_REG
212# define RTASM_ARM_DSB_SY "dsb sy\n\t"
213# define RTASM_ARM_DSB_SY_IN_REG
214# define RTASM_ARM_DSB_SY_COMMA_IN_REG
215# define RTASM_ARM_DMB_SY "dmb sy\n\t"
216# define RTASM_ARM_DMB_SY_IN_REG
217# define RTASM_ARM_DMB_SY_COMMA_IN_REG
218# define RTASM_ARM_DMB_ST "dmb st\n\t"
219# define RTASM_ARM_DMB_ST_IN_REG
220# define RTASM_ARM_DMB_ST_COMMA_IN_REG
221# define RTASM_ARM_DMB_LD "dmb ld\n\t"
222# define RTASM_ARM_DMB_LD_IN_REG
223# define RTASM_ARM_DMB_LD_COMMA_IN_REG
224# define RTASM_ARM_PICK_6432(expr64, expr32) expr64
/* The ARM64 variants below paste in 'modify64' and ignore 'modify32'; 'in_reg'
   forms the whole input operand list. The result is left in u32NewRet /
   u64NewRet (RET_NEW) or u32OldRet / u64OldRet (RET_OLD). The RET_OLD variants
   also declare a uNew scratch operand (u32NewSpill / u64NewSpill) which is what
   gets stored; presumably the modify snippet computes it from %[uOld] - the
   snippets are supplied by the users of these macros. */
225# define RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_32(name, a_pu32Mem, barrier_type, modify64, modify32, in_reg) \
226 uint32_t rcSpill; \
227 uint32_t u32NewRet; \
228 __asm__ __volatile__("Ltry_again_" #name "_%=:\n\t" \
229 RTASM_ARM_##barrier_type /* before label? */ \
230 "ldaxr %w[uNew], %[pMem]\n\t" \
231 modify64 \
232 "stlxr %w[rc], %w[uNew], %[pMem]\n\t" \
233 "cbnz %w[rc], Ltry_again_" #name "_%=\n\t" \
234 : [pMem] "+Q" (*a_pu32Mem) \
235 , [uNew] "=&r" (u32NewRet) \
236 , [rc] "=&r" (rcSpill) \
237 : in_reg \
238 : "cc")
239# define RTASM_ARM_LOAD_MODIFY_STORE_RET_OLD_32(name, a_pu32Mem, barrier_type, modify64, modify32, in_reg) \
240 uint32_t rcSpill; \
241 uint32_t u32OldRet; \
242 uint32_t u32NewSpill; \
243 __asm__ __volatile__("Ltry_again_" #name "_%=:\n\t" \
244 RTASM_ARM_##barrier_type /* before label? */ \
245 "ldaxr %w[uOld], %[pMem]\n\t" \
246 modify64 \
247 "stlxr %w[rc], %w[uNew], %[pMem]\n\t" \
248 "cbnz %w[rc], Ltry_again_" #name "_%=\n\t" \
249 : [pMem] "+Q" (*a_pu32Mem) \
250 , [uOld] "=&r" (u32OldRet) \
251 , [uNew] "=&r" (u32NewSpill) \
252 , [rc] "=&r" (rcSpill) \
253 : in_reg \
254 : "cc")
255# define RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_64(name, a_pu64Mem, barrier_type, modify64, modify32, in_reg) \
256 uint32_t rcSpill; \
257 uint64_t u64NewRet; \
258 __asm__ __volatile__("Ltry_again_" #name "_%=:\n\t" \
259 RTASM_ARM_##barrier_type /* before label? */ \
260 "ldaxr %[uNew], %[pMem]\n\t" \
261 modify64 \
262 "stlxr %w[rc], %[uNew], %[pMem]\n\t" \
263 "cbnz %w[rc], Ltry_again_" #name "_%=\n\t" \
264 : [pMem] "+Q" (*a_pu64Mem) \
265 , [uNew] "=&r" (u64NewRet) \
266 , [rc] "=&r" (rcSpill) \
267 : in_reg \
268 : "cc")
269# define RTASM_ARM_LOAD_MODIFY_STORE_RET_OLD_64(name, a_pu64Mem, barrier_type, modify64, modify32, in_reg) \
270 uint32_t rcSpill; \
271 uint64_t u64OldRet; \
272 uint64_t u64NewSpill; \
273 __asm__ __volatile__("Ltry_again_" #name "_%=:\n\t" \
274 RTASM_ARM_##barrier_type /* before label? */ \
275 "ldaxr %[uOld], %[pMem]\n\t" \
276 modify64 \
277 "stlxr %w[rc], %[uNew], %[pMem]\n\t" \
278 "cbnz %w[rc], Ltry_again_" #name "_%=\n\t" \
279 : [pMem] "+Q" (*a_pu64Mem) \
280 , [uOld] "=&r" (u64OldRet) \
281 , [uNew] "=&r" (u64NewSpill) \
282 , [rc] "=&r" (rcSpill) \
283 : in_reg \
284 : "cc")
285
286# else /* RT_ARCH_ARM32 */
287# define RTASM_ARM_PICK_6432(expr64, expr32) expr32
/* NOTE(review): the '# warning armvN' lines in the branches below trigger on
   every inclusion of this header for ARM32; they look like porting/debugging
   markers - confirm before expecting warning-free ARM32 builds. */
288# if RT_ARCH_ARM32 >= 7
289# warning armv7
290# define RTASM_ARM_NO_BARRIER_IN_REG
291# define RTASM_ARM_NO_BARRIER_COMMA_IN_REG
/* The barrier strings of this branch reference no operand; the "X" (0xfade)
   inputs are presumably placeholders keeping the operand list well-formed. */
292# define RTASM_ARM_DSB_SY "dsb sy\n\t"
293# define RTASM_ARM_DSB_SY_IN_REG "X" (0xfade)
294# define RTASM_ARM_DMB_SY "dmb sy\n\t"
295# define RTASM_ARM_DMB_SY_IN_REG "X" (0xfade)
296# define RTASM_ARM_DMB_ST "dmb st\n\t"
297# define RTASM_ARM_DMB_ST_IN_REG "X" (0xfade)
298# define RTASM_ARM_DMB_LD "dmb ld\n\t"
299# define RTASM_ARM_DMB_LD_IN_REG "X" (0xfade)
300
301# elif RT_ARCH_ARM32 >= 6
302# warning armv6
/* Barriers are issued as MCR p15 operations taking a register holding zero
   (%[uZero]); the ST and LD flavours map onto the SY one. */
303# define RTASM_ARM_DSB_SY "mcr p15, 0, %[uZero], c7, c10, 4\n\t"
304# define RTASM_ARM_DSB_SY_IN_REG [uZero] "r" (0)
305# define RTASM_ARM_DMB_SY "mcr p15, 0, %[uZero], c7, c10, 5\n\t"
306# define RTASM_ARM_DMB_SY_IN_REG [uZero] "r" (0)
307# define RTASM_ARM_DMB_ST RTASM_ARM_DMB_SY
308# define RTASM_ARM_DMB_ST_IN_REG RTASM_ARM_DMB_SY_IN_REG
309# define RTASM_ARM_DMB_LD RTASM_ARM_DMB_SY
310# define RTASM_ARM_DMB_LD_IN_REG RTASM_ARM_DMB_SY_IN_REG
311
312# elif RT_ARCH_ARM32 >= 4
313# warning armv5 or older
/* Only the DSB operation is defined here; all DMB flavours map onto it. */
314# define RTASM_ARM_DSB_SY "mcr p15, 0, %[uZero], c7, c10, 4\n\t"
315# define RTASM_ARM_DSB_SY_IN_REG [uZero] "r" (0)
316# define RTASM_ARM_DMB_SY RTASM_ARM_DSB_SY
317# define RTASM_ARM_DMB_SY_IN_REG RTASM_ARM_DSB_SY_IN_REG
318# define RTASM_ARM_DMB_ST RTASM_ARM_DSB_SY
319# define RTASM_ARM_DMB_ST_IN_REG RTASM_ARM_DSB_SY_IN_REG
320# define RTASM_ARM_DMB_LD RTASM_ARM_DSB_SY
321# define RTASM_ARM_DMB_LD_IN_REG RTASM_ARM_DSB_SY_IN_REG
322# else
323# error "huh? Odd RT_ARCH_ARM32 value!"
324# endif
325# define RTASM_ARM_DSB_SY_COMMA_IN_REG , RTASM_ARM_DSB_SY_IN_REG
326# define RTASM_ARM_DMB_SY_COMMA_IN_REG , RTASM_ARM_DMB_SY_IN_REG
327# define RTASM_ARM_DMB_ST_COMMA_IN_REG , RTASM_ARM_DMB_ST_IN_REG
328# define RTASM_ARM_DMB_LD_COMMA_IN_REG , RTASM_ARM_DMB_LD_IN_REG
/* The ARM32 variants below paste in 'modify32' and ignore 'modify64'. The
   barrier's *_IN_REG operand is placed in front of 'in_reg' in the input
   operand list. Result variables are named as in the ARM64 variants. */
329# define RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_32(name, a_pu32Mem, barrier_type, modify64, modify32, in_reg) \
330 uint32_t rcSpill; \
331 uint32_t u32NewRet; \
332 __asm__ __volatile__("Ltry_again_" #name "_%=:\n\t" \
333 RT_CONCAT(RTASM_ARM_,barrier_type) /* before label? */ \
334 "ldrex %[uNew], %[pMem]\n\t" \
335 modify32 \
336 "strex %[rc], %[uNew], %[pMem]\n\t" \
337 "cmp %[rc], #0\n\t" \
338 "bne Ltry_again_" #name "_%=\n\t" \
339 : [pMem] "+m" (*a_pu32Mem) \
340 , [uNew] "=&r" (u32NewRet) \
341 , [rc] "=&r" (rcSpill) \
342 : RT_CONCAT3(RTASM_ARM_,barrier_type,_IN_REG) \
343 , in_reg \
344 : "cc")
345# define RTASM_ARM_LOAD_MODIFY_STORE_RET_OLD_32(name, a_pu32Mem, barrier_type, modify64, modify32, in_reg) \
346 uint32_t rcSpill; \
347 uint32_t u32OldRet; \
348 uint32_t u32NewSpill; \
349 __asm__ __volatile__("Ltry_again_" #name "_%=:\n\t" \
350 RT_CONCAT(RTASM_ARM_,barrier_type) /* before label? */ \
351 "ldrex %[uOld], %[pMem]\n\t" \
352 modify32 \
353 "strex %[rc], %[uNew], %[pMem]\n\t" \
354 "cmp %[rc], #0\n\t" \
355 "bne Ltry_again_" #name "_%=\n\t" \
356 : [pMem] "+m" (*a_pu32Mem) \
357 , [uOld] "=&r" (u32OldRet) \
358 , [uNew] "=&r" (u32NewSpill) \
359 , [rc] "=&r" (rcSpill) \
360 : RT_CONCAT3(RTASM_ARM_,barrier_type,_IN_REG) \
361 , in_reg \
362 : "cc")
363# define RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_64(name, a_pu64Mem, barrier_type, modify64, modify32, in_reg) \
364 uint32_t rcSpill; \
365 uint64_t u64NewRet; \
366 __asm__ __volatile__("Ltry_again_" #name "_%=:\n\t" \
367 RT_CONCAT(RTASM_ARM_,barrier_type) /* before label? */ \
368 "ldrexd %[uNew], %H[uNew], %[pMem]\n\t" \
369 modify32 \
370 "strexd %[rc], %[uNew], %H[uNew], %[pMem]\n\t" \
371 "cmp %[rc], #0\n\t" \
372 "bne Ltry_again_" #name "_%=\n\t" \
373 : [pMem] "+m" (*a_pu64Mem), \
374 [uNew] "=&r" (u64NewRet), \
375 [rc] "=&r" (rcSpill) \
376 : RT_CONCAT3(RTASM_ARM_,barrier_type,_IN_REG) \
377 , in_reg \
378 : "cc")
379# define RTASM_ARM_LOAD_MODIFY_STORE_RET_OLD_64(name, a_pu64Mem, barrier_type, modify64, modify32, in_reg) \
380 uint32_t rcSpill; \
381 uint64_t u64OldRet; \
382 uint64_t u64NewSpill; \
383 __asm__ __volatile__("Ltry_again_" #name "_%=:\n\t" \
384 RT_CONCAT(RTASM_ARM_,barrier_type) /* before label? */ \
385 "ldrexd %[uOld], %H[uOld], %[pMem]\n\t" \
386 modify32 \
387 "strexd %[rc], %[uNew], %H[uNew], %[pMem]\n\t" \
388 "cmp %[rc], #0\n\t" \
389 "bne Ltry_again_" #name "_%=\n\t" \
390 : [pMem] "+m" (*a_pu64Mem), \
391 [uOld] "=&r" (u64OldRet), \
392 [uNew] "=&r" (u64NewSpill), \
393 [rc] "=&r" (rcSpill) \
394 : RT_CONCAT3(RTASM_ARM_,barrier_type,_IN_REG) \
395 , in_reg \
396 : "cc")
397# endif /* RT_ARCH_ARM32 */
398#endif
399
400
401/** @def ASMReturnAddress
402 * Gets the return address of the current (or calling if you like) function or method.
 *
 * Maps to the _ReturnAddress() intrinsic for Visual C++ and to
 * __builtin_return_address(0) for GCC-compatible compilers. For Watcom it
 * expands to a call to a descriptively named function that is not declared
 * here, presumably so that any use is caught at build time.
403 */
404#ifdef _MSC_VER
405# ifdef __cplusplus
406extern "C"
407# endif
408void * _ReturnAddress(void);
409# pragma intrinsic(_ReturnAddress)
410# define ASMReturnAddress() _ReturnAddress()
411#elif defined(__GNUC__) || defined(DOXYGEN_RUNNING)
412# define ASMReturnAddress() __builtin_return_address(0)
413#elif defined(__WATCOMC__)
414# define ASMReturnAddress() Watcom_does_not_appear_to_have_intrinsic_return_address_function()
415#else
416# error "Unsupported compiler."
417#endif
418
419
420/**
421 * Compiler memory barrier.
422 *
423 * Ensure that the compiler does not use any cached (register/tmp stack) memory
424 * values or any outstanding writes when returning from this function.
425 *
426 * This function must be used if non-volatile data is modified by a
427 * device or the VMM. Typical cases are port access, MMIO access,
428 * trapping instruction, etc.
 *
 * Implemented as an empty asm statement with a "memory" clobber for GNU style
 * inline assembly, as _ReadWriteBarrier() when intrinsics are used, as a plain
 * function declaration for Watcom, and as an inline function containing an
 * empty __asm block otherwise.
429 */
430#if RT_INLINE_ASM_GNU_STYLE
431# define ASMCompilerBarrier() do { __asm__ __volatile__("" : : : "memory"); } while (0)
432#elif RT_INLINE_ASM_USES_INTRIN
433# define ASMCompilerBarrier() do { _ReadWriteBarrier(); } while (0)
434#elif defined(__WATCOMC__)
435void ASMCompilerBarrier(void);
436#else /* 2003 should have _ReadWriteBarrier() but I guess we're at 2002 level then... */
437DECLINLINE(void) ASMCompilerBarrier(void) RT_NOTHROW_DEF
438{
439 __asm
440 {
441 }
442}
443#endif
444
445
446/** @def ASMBreakpoint
447 * Debugger Breakpoint.
 *
 * Plain alias that expands to RT_BREAKPOINT().
 *
448 * @deprecated Use RT_BREAKPOINT instead.
449 * @internal
450 */
451#define ASMBreakpoint() RT_BREAKPOINT()
452
453
454/**
455 * Spinloop hint for platforms that have these, empty function on the other
456 * platforms.
457 *
458 * x86 & AMD64: The PAUSE variant of NOP for helping hyperthreaded CPUs detecting
459 * spin locks.
 *
 * ARM32 & ARM64: The YIELD instruction.
460 */
461#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && (defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86))
/* Only the prototype is provided in this configuration. */
462RT_ASM_DECL_PRAGMA_WATCOM(void) ASMNopPause(void) RT_NOTHROW_PROTO;
463#else
464DECLINLINE(void) ASMNopPause(void) RT_NOTHROW_DEF
465{
466# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
    /* The instruction is emitted as the raw byte sequence f3 90 rather than by mnemonic. */
467# if RT_INLINE_ASM_GNU_STYLE
468 __asm__ __volatile__(".byte 0xf3,0x90\n\t");
469# else
470 __asm {
471 _emit 0f3h
472 _emit 090h
473 }
474# endif
475
476# elif defined(RT_ARCH_ARM32) || defined(RT_ARCH_ARM64)
477 __asm__ __volatile__("yield\n\t"); /* ARMv6K+ */
478
479# else
480 /* dummy */
481# endif
482}
483#endif
484
485
486/**
487 * Atomically Exchange an unsigned 8-bit value, ordered.
488 *
489 * @returns The value *pu8 held before the exchange.
490 * @param pu8 Pointer to the 8-bit variable to update.
491 * @param u8 The 8-bit value to assign to *pu8.
492 */
493#if RT_INLINE_ASM_EXTERNAL_TMP_ARM
/* Only the prototype is provided in this configuration. */
494RT_ASM_DECL_PRAGMA_WATCOM(uint8_t) ASMAtomicXchgU8(volatile uint8_t RT_FAR *pu8, uint8_t u8) RT_NOTHROW_PROTO;
495#else
496DECLINLINE(uint8_t) ASMAtomicXchgU8(volatile uint8_t RT_FAR *pu8, uint8_t u8) RT_NOTHROW_DEF
497{
498# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
    /* x86/AMD64: a single XCHG with a memory operand; u8 is both the input
       (tied to output operand 1) and receives the previous memory content. */
499# if RT_INLINE_ASM_GNU_STYLE
500 __asm__ __volatile__("xchgb %0, %1\n\t"
501 : "=m" (*pu8)
502 , "=q" (u8) /* =r - busted on g++ (GCC) 3.4.4 20050721 (Red Hat 3.4.4-2) */
503 : "1" (u8)
504 , "m" (*pu8));
505# else
506 __asm
507 {
508# ifdef RT_ARCH_AMD64
509 mov rdx, [pu8]
510 mov al, [u8]
511 xchg [rdx], al
512 mov [u8], al
513# else
514 mov edx, [pu8]
515 mov al, [u8]
516 xchg [edx], al
517 mov [u8], al
518# endif
519 }
520# endif
521 return u8;
522
523# elif defined(RT_ARCH_ARM32) || defined(RT_ARCH_ARM64)
    /* ARM: the old value is collected in a 32-bit register and truncated on return. */
524 uint32_t uOld;
525# if defined(RTASM_ARM64_USE_FEAT_LSE)
526 /* SWPALB is ~40% more expensive than the non-LSE variant (M1), but since we
527 have the barrier we shouldn't need that, right? Ordering should be taken
528 care of by the DMB. The SWPB is rather cheap (~70% faster). */
529 __asm__ __volatile__("Lstart_ASMAtomicXchgU8_%=:\n\t"
530# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
531 "swpalb %w[uNew], %w[uOld], %[pMem]\n\t"
532# else
533 RTASM_ARM_DMB_SY
534 "swpb %w[uNew], %w[uOld], %[pMem]\n\t"
535# endif
536 : [pMem] "+Q" (*pu8)
537 , [uOld] "=&r" (uOld)
538 : [uNew] "r" ((uint32_t)u8)
539 : );
540# else
    /* Without LSE: exclusive load + exclusive store, retried while the store
       reports failure (non-zero rc). */
541 uint32_t rcSpill;
542 __asm__ __volatile__("Ltry_again_ASMAtomicXchgU8_%=:\n\t"
543 RTASM_ARM_DMB_SY
544# if defined(RT_ARCH_ARM64)
545 "ldaxrb %w[uOld], %[pMem]\n\t"
546 "stlxrb %w[rc], %w[uNew], %[pMem]\n\t"
547 "cbnz %w[rc], Ltry_again_ASMAtomicXchgU8_%=\n\t"
548# else
549 "ldrexb %[uOld], %[pMem]\n\t" /* ARMv6+ */
550 "strexb %[rc], %[uNew], %[pMem]\n\t"
551 "cmp %[rc], #0\n\t"
552 "bne Ltry_again_ASMAtomicXchgU8_%=\n\t"
553# endif
554 : [pMem] "+Q" (*pu8)
555 , [uOld] "=&r" (uOld)
556 , [rc] "=&r" (rcSpill)
557 : [uNew] "r" ((uint32_t)u8)
558 RTASM_ARM_DMB_SY_COMMA_IN_REG
559 : "cc");
560# endif
561 return (uint8_t)uOld;
562
563# else
564# error "Port me"
565# endif
566}
567#endif
568
569
570/**
571 * Atomically Exchange a signed 8-bit value, ordered.
572 *
573 * @returns Current *pu8 value
574 * @param pi8 Pointer to the 8-bit variable to update.
575 * @param i8 The 8-bit value to assign to *pi8.
576 */
577DECLINLINE(int8_t) ASMAtomicXchgS8(volatile int8_t RT_FAR *pi8, int8_t i8) RT_NOTHROW_DEF
578{
579 return (int8_t)ASMAtomicXchgU8((volatile uint8_t RT_FAR *)pi8, (uint8_t)i8);
580}
581
582
583/**
584 * Atomically Exchange a bool value, ordered.
585 *
586 * @returns Current *pf value
587 * @param pf Pointer to the 8-bit variable to update.
588 * @param f The 8-bit value to assign to *pi8.
589 */
590DECLINLINE(bool) ASMAtomicXchgBool(volatile bool RT_FAR *pf, bool f) RT_NOTHROW_DEF
591{
592#ifdef _MSC_VER
593 return !!ASMAtomicXchgU8((volatile uint8_t RT_FAR *)pf, (uint8_t)f);
594#else
595 return (bool)ASMAtomicXchgU8((volatile uint8_t RT_FAR *)pf, (uint8_t)f);
596#endif
597}
598
599
600/**
601 * Atomically Exchange an unsigned 16-bit value, ordered.
602 *
603 * @returns The value *pu16 held before the exchange.
604 * @param pu16 Pointer to the 16-bit variable to update.
605 * @param u16 The 16-bit value to assign to *pu16.
606 */
607#if RT_INLINE_ASM_EXTERNAL_TMP_ARM
/* Only the prototype is provided in this configuration. */
608RT_ASM_DECL_PRAGMA_WATCOM(uint16_t) ASMAtomicXchgU16(volatile uint16_t RT_FAR *pu16, uint16_t u16) RT_NOTHROW_PROTO;
609#else
610DECLINLINE(uint16_t) ASMAtomicXchgU16(volatile uint16_t RT_FAR *pu16, uint16_t u16) RT_NOTHROW_DEF
611{
612# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
    /* x86/AMD64: a single XCHG with a memory operand; u16 is both the input
       (tied to output operand 1) and receives the previous memory content. */
613# if RT_INLINE_ASM_GNU_STYLE
614 __asm__ __volatile__("xchgw %0, %1\n\t"
615 : "=m" (*pu16)
616 , "=r" (u16)
617 : "1" (u16)
618 , "m" (*pu16));
619# else
620 __asm
621 {
622# ifdef RT_ARCH_AMD64
623 mov rdx, [pu16]
624 mov ax, [u16]
625 xchg [rdx], ax
626 mov [u16], ax
627# else
628 mov edx, [pu16]
629 mov ax, [u16]
630 xchg [edx], ax
631 mov [u16], ax
632# endif
633 }
634# endif
635 return u16;
636
637# elif defined(RT_ARCH_ARM32) || defined(RT_ARCH_ARM64)
    /* ARM: the old value is collected in a 32-bit register and truncated on return. */
638 uint32_t uOld;
639# if defined(RTASM_ARM64_USE_FEAT_LSE)
640 /* SWPALH is ~40% more expensive than the non-LSE variant on an M1, 20%
641 slower if we remove the barrier. But since we have the barrier we
642 shouldn't need that, right? Ordering should be taken care of by the DMB.
643 The SWPH is rather cheap (~70% faster). */
644 __asm__ __volatile__("Lstart_ASMAtomicXchgU16_%=:\n\t"
645# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
646 "swpalh %w[uNew], %w[uOld], %[pMem]\n\t"
647# else
648 RTASM_ARM_DMB_SY
649 "swph %w[uNew], %w[uOld], %[pMem]\n\t"
650# endif
651 : [pMem] "+Q" (*pu16)
652 , [uOld] "=&r" (uOld)
653 : [uNew] "r" ((uint32_t)u16)
654 : );
655# else
    /* Without LSE: exclusive load + exclusive store, retried while the store
       reports failure (non-zero rc). */
656 uint32_t rcSpill;
657 __asm__ __volatile__("Ltry_again_ASMAtomicXchgU16_%=:\n\t"
658 RTASM_ARM_DMB_SY
659# if defined(RT_ARCH_ARM64)
660 "ldaxrh %w[uOld], %[pMem]\n\t"
661 "stlxrh %w[rc], %w[uNew], %[pMem]\n\t"
662 "cbnz %w[rc], Ltry_again_ASMAtomicXchgU16_%=\n\t"
663# else
664 "ldrexh %[uOld], %[pMem]\n\t" /* ARMv6+ */
665 "strexh %[rc], %[uNew], %[pMem]\n\t"
666 "cmp %[rc], #0\n\t"
667 "bne Ltry_again_ASMAtomicXchgU16_%=\n\t"
668# endif
669 : [pMem] "+Q" (*pu16)
670 , [uOld] "=&r" (uOld)
671 , [rc] "=&r" (rcSpill)
672 : [uNew] "r" ((uint32_t)u16)
673 RTASM_ARM_DMB_SY_COMMA_IN_REG
674 : "cc");
675# endif
676 return (uint16_t)uOld;
677
678# else
679# error "Port me"
680# endif
681}
682#endif
683
684
685/**
686 * Atomically Exchange a signed 16-bit value, ordered.
687 *
688 * @returns Current *pu16 value
689 * @param pi16 Pointer to the 16-bit variable to update.
690 * @param i16 The 16-bit value to assign to *pi16.
691 */
692DECLINLINE(int16_t) ASMAtomicXchgS16(volatile int16_t RT_FAR *pi16, int16_t i16) RT_NOTHROW_DEF
693{
694 return (int16_t)ASMAtomicXchgU16((volatile uint16_t RT_FAR *)pi16, (uint16_t)i16);
695}
696
697
698/**
699 * Atomically Exchange an unsigned 32-bit value, ordered.
700 *
701 * @returns The value *pu32 held before the exchange.
702 * @param pu32 Pointer to the 32-bit variable to update.
703 * @param u32 The 32-bit value to assign to *pu32.
704 *
705 * @remarks Does not work on 286 and earlier.
706 */
707#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
/* Only the prototype is provided in this configuration. */
708RT_ASM_DECL_PRAGMA_WATCOM(uint32_t) ASMAtomicXchgU32(volatile uint32_t RT_FAR *pu32, uint32_t u32) RT_NOTHROW_PROTO;
709#else
710DECLINLINE(uint32_t) ASMAtomicXchgU32(volatile uint32_t RT_FAR *pu32, uint32_t u32) RT_NOTHROW_DEF
711{
712# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
    /* x86/AMD64: XCHG with a memory operand (or the _InterlockedExchange
       intrinsic); in all variants u32 ends up holding the previous content. */
713# if RT_INLINE_ASM_GNU_STYLE
714 __asm__ __volatile__("xchgl %0, %1\n\t"
715 : "=m" (*pu32) /** @todo r=bird: +m rather than =m here? */
716 , "=r" (u32)
717 : "1" (u32)
718 , "m" (*pu32));
719
720# elif RT_INLINE_ASM_USES_INTRIN
721 u32 = _InterlockedExchange((long RT_FAR *)pu32, u32);
722
723# else
724 __asm
725 {
726# ifdef RT_ARCH_AMD64
727 mov rdx, [pu32]
728 mov eax, u32
729 xchg [rdx], eax
730 mov [u32], eax
731# else
732 mov edx, [pu32]
733 mov eax, u32
734 xchg [edx], eax
735 mov [u32], eax
736# endif
737 }
738# endif
739 return u32;
740
741# elif defined(RT_ARCH_ARM32) || defined(RT_ARCH_ARM64)
742 uint32_t uOld;
743# if defined(RTASM_ARM64_USE_FEAT_LSE)
744 /* SWPAL is ~40% more expensive than the non-LSE variant on an M1, 20%
745 slower if we remove the barrier. But since we have the barrier we
746 shouldn't need that, right? Ordering should be taken care of by the DMB.
747 The SWP is rather cheap (~70% faster). */
748 __asm__ __volatile__("Lstart_ASMAtomicXchgU32_%=:\n\t"
749# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
750 "swpal %w[uNew], %w[uOld], %[pMem]\n\t"
751# else
752 RTASM_ARM_DMB_SY
753 "swp %w[uNew], %w[uOld], %[pMem]\n\t"
754# endif
755 : [pMem] "+Q" (*pu32)
756 , [uOld] "=&r" (uOld)
757 : [uNew] "r" (u32)
758 : );
759# else
    /* Without LSE: exclusive load + exclusive store, retried while the store
       reports failure (non-zero rc). */
760 uint32_t rcSpill;
761 __asm__ __volatile__("Ltry_again_ASMAtomicXchgU32_%=:\n\t"
762 RTASM_ARM_DMB_SY
763# if defined(RT_ARCH_ARM64)
764 "ldaxr %w[uOld], %[pMem]\n\t"
765 "stlxr %w[rc], %w[uNew], %[pMem]\n\t"
766 "cbnz %w[rc], Ltry_again_ASMAtomicXchgU32_%=\n\t"
767# else
768 "ldrex %[uOld], %[pMem]\n\t" /* ARMv6+ */
769 "strex %[rc], %[uNew], %[pMem]\n\t"
770 "cmp %[rc], #0\n\t"
771 "bne Ltry_again_ASMAtomicXchgU32_%=\n\t"
772# endif
773 : [pMem] "+Q" (*pu32)
774 , [uOld] "=&r" (uOld)
775 , [rc] "=&r" (rcSpill)
776 : [uNew] "r" (u32)
777 RTASM_ARM_DMB_SY_COMMA_IN_REG
778 : "cc");
779# endif
780 return uOld;
781
782# else
783# error "Port me"
784# endif
785}
786#endif
787
788
789/**
790 * Atomically Exchange a signed 32-bit value, ordered.
791 *
792 * @returns Current *pu32 value
793 * @param pi32 Pointer to the 32-bit variable to update.
794 * @param i32 The 32-bit value to assign to *pi32.
795 */
796DECLINLINE(int32_t) ASMAtomicXchgS32(volatile int32_t RT_FAR *pi32, int32_t i32) RT_NOTHROW_DEF
797{
798 return (int32_t)ASMAtomicXchgU32((volatile uint32_t RT_FAR *)pi32, (uint32_t)i32);
799}
800
801
802/**
803 * Atomically Exchange an unsigned 64-bit value, ordered.
804 *
805 * @returns The value *pu64 held before the exchange.
806 * @param pu64 Pointer to the 64-bit variable to update.
807 * @param u64 The 64-bit value to assign to *pu64.
808 *
809 * @remarks Works on 32-bit x86 CPUs starting with Pentium.
810 */
811#if (RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN) \
812 || RT_INLINE_DONT_MIX_CMPXCHG8B_AND_PIC
/* Only the prototype is provided in this configuration. */
813RT_ASM_DECL_PRAGMA_WATCOM(uint64_t) ASMAtomicXchgU64(volatile uint64_t RT_FAR *pu64, uint64_t u64) RT_NOTHROW_PROTO;
814#else
815DECLINLINE(uint64_t) ASMAtomicXchgU64(volatile uint64_t RT_FAR *pu64, uint64_t u64) RT_NOTHROW_DEF
816{
817# if defined(RT_ARCH_AMD64)
    /* AMD64: the intrinsic or a single 64-bit XCHG with a memory operand. */
818# if RT_INLINE_ASM_USES_INTRIN
819 return _InterlockedExchange64((__int64 *)pu64, u64);
820
821# elif RT_INLINE_ASM_GNU_STYLE
822 __asm__ __volatile__("xchgq %0, %1\n\t"
823 : "=m" (*pu64)
824 , "=r" (u64)
825 : "1" (u64)
826 , "m" (*pu64));
827 return u64;
828# else
829 __asm
830 {
831 mov rdx, [pu64]
832 mov rax, [u64]
833 xchg [rdx], rax
834 mov [u64], rax
835 }
836 return u64;
837# endif
838
839# elif defined(RT_ARCH_X86)
    /* 32-bit x86: the exchange is done with a LOCK CMPXCHG8B loop that is
       repeated until the instruction succeeds (jnz back to the label); the
       new value goes in ECX:EBX and the old value comes back in EDX:EAX. */
840# if RT_INLINE_ASM_GNU_STYLE
841# if defined(PIC) || defined(__PIC__)
    /* PIC: EBX is not passed as an asm operand here (presumably because it is
       reserved in PIC code); the low dword lives in the u32EBX memory operand
       and is swapped into EBX before the loop, with EBX restored afterwards. */
842 uint32_t u32EBX = (uint32_t)u64;
843 __asm__ __volatile__(/*"xchgl %%esi, %5\n\t"*/
844 "xchgl %%ebx, %3\n\t"
845 "1:\n\t"
846 "lock; cmpxchg8b (%5)\n\t"
847 "jnz 1b\n\t"
848 "movl %3, %%ebx\n\t"
849 /*"xchgl %%esi, %5\n\t"*/
850 : "=A" (u64)
851 , "=m" (*pu64)
852 : "0" (*pu64)
853 , "m" ( u32EBX )
854 , "c" ( (uint32_t)(u64 >> 32) )
855 , "S" (pu64)
856 : "cc");
857# else /* !PIC */
858 __asm__ __volatile__("1:\n\t"
859 "lock; cmpxchg8b %1\n\t"
860 "jnz 1b\n\t"
861 : "=A" (u64)
862 , "=m" (*pu64)
863 : "0" (*pu64)
864 , "b" ( (uint32_t)u64 )
865 , "c" ( (uint32_t)(u64 >> 32) )
866 : "cc");
867# endif
868# else
869 __asm
870 {
871 mov ebx, dword ptr [u64]
872 mov ecx, dword ptr [u64 + 4]
873 mov edi, pu64
874 mov eax, dword ptr [edi]
875 mov edx, dword ptr [edi + 4]
876 retry:
877 lock cmpxchg8b [edi]
878 jnz retry
879 mov dword ptr [u64], eax
880 mov dword ptr [u64 + 4], edx
881 }
882# endif
883 return u64;
884
885# elif defined(RT_ARCH_ARM32) || defined(RT_ARCH_ARM64)
886 uint64_t uOld;
887# if defined(RTASM_ARM64_USE_FEAT_LSE)
888 /* SWPAL is ~40% more expensive than the non-LSE variant on an M1, 20%
889 slower if we remove the barrier. But since we have the barrier we
890 shouldn't need that, right? Ordering should be taken care of by the DMB.
891 The SWP is rather cheap (~70% faster). */
892 __asm__ __volatile__("Lstart_ASMAtomicXchgU64_%=:\n\t"
893# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
894 "swpal %[uNew], %[uOld], %[pMem]\n\t"
895# else
896 RTASM_ARM_DMB_SY
897 "swp %[uNew], %[uOld], %[pMem]\n\t"
898# endif
899 : [pMem] "+Q" (*pu64)
900 , [uOld] "=&r" (uOld)
901 : [uNew] "r" (u64)
902 : );
903# else
    /* Without LSE: exclusive load + exclusive store, retried while the store
       reports failure (non-zero rc). */
904 uint32_t rcSpill;
905 __asm__ __volatile__("Ltry_again_ASMAtomicXchgU64_%=:\n\t"
906 RTASM_ARM_DMB_SY
907# if defined(RT_ARCH_ARM64)
908 "ldaxr %[uOld], %[pMem]\n\t"
909 "stlxr %w[rc], %[uNew], %[pMem]\n\t"
910 "cbnz %w[rc], Ltry_again_ASMAtomicXchgU64_%=\n\t"
911# else
912 "ldrexd %[uOld], %H[uOld], %[pMem]\n\t" /* ARMv6+ */
913 "strexd %[rc], %[uNew], %H[uNew], %[pMem]\n\t"
914 "cmp %[rc], #0\n\t"
915 "bne Ltry_again_ASMAtomicXchgU64_%=\n\t"
916# endif
917 : [pMem] "+Q" (*pu64)
918 , [uOld] "=&r" (uOld)
919 , [rc] "=&r" (rcSpill)
920 : [uNew] "r" (u64)
921 RTASM_ARM_DMB_SY_COMMA_IN_REG
922 : "cc");
923# endif
924 return uOld;
925
926# else
927# error "Port me"
928# endif
929}
930#endif
931
932
933/**
934 * Atomically Exchange an signed 64-bit value, ordered.
935 *
936 * @returns Current *pi64 value
937 * @param pi64 Pointer to the 64-bit variable to update.
938 * @param i64 The 64-bit value to assign to *pi64.
939 */
940DECLINLINE(int64_t) ASMAtomicXchgS64(volatile int64_t RT_FAR *pi64, int64_t i64) RT_NOTHROW_DEF
941{
942 return (int64_t)ASMAtomicXchgU64((volatile uint64_t RT_FAR *)pi64, (uint64_t)i64);
943}
944
945
946/**
947 * Atomically Exchange a size_t value, ordered.
948 *
949 * @returns Current *ppv value
950 * @param puDst Pointer to the size_t variable to update.
951 * @param uNew The new value to assign to *puDst.
952 */
953DECLINLINE(size_t) ASMAtomicXchgZ(size_t volatile RT_FAR *puDst, const size_t uNew) RT_NOTHROW_DEF
954{
955#if ARCH_BITS == 16
956 AssertCompile(sizeof(size_t) == 2);
957 return ASMAtomicXchgU16((volatile uint16_t RT_FAR *)puDst, uNew);
958#elif ARCH_BITS == 32
959 return ASMAtomicXchgU32((volatile uint32_t RT_FAR *)puDst, uNew);
960#elif ARCH_BITS == 64
961 return ASMAtomicXchgU64((volatile uint64_t RT_FAR *)puDst, uNew);
962#else
963# error "ARCH_BITS is bogus"
964#endif
965}
966
967
968/**
969 * Atomically Exchange a pointer value, ordered.
970 *
971 * @returns Current *ppv value
972 * @param ppv Pointer to the pointer variable to update.
973 * @param pv The pointer value to assign to *ppv.
974 */
975DECLINLINE(void RT_FAR *) ASMAtomicXchgPtr(void RT_FAR * volatile RT_FAR *ppv, const void RT_FAR *pv) RT_NOTHROW_DEF
976{
977#if ARCH_BITS == 32 || ARCH_BITS == 16
978 return (void RT_FAR *)ASMAtomicXchgU32((volatile uint32_t RT_FAR *)(void RT_FAR *)ppv, (uint32_t)pv);
979#elif ARCH_BITS == 64
980 return (void RT_FAR *)ASMAtomicXchgU64((volatile uint64_t RT_FAR *)(void RT_FAR *)ppv, (uint64_t)pv);
981#else
982# error "ARCH_BITS is bogus"
983#endif
984}
985
986
/**
 * Convenience macro for avoiding the annoying casting with ASMAtomicXchgPtr.
 *
 * The GNU C variant routes the arguments through typed temporaries so that
 * the compiler checks @a ppv and @a pv against @a Type; the generic variant
 * simply casts.  The generic expansion is fully parenthesised so the result
 * can be used as an operand of postfix operators (e.g. '->') without the
 * operator binding to the call instead of the cast.
 *
 * @returns Current *ppv value, as @a Type.
 * @param   ppv     Pointer to the pointer variable to update.
 * @param   pv      The pointer value to assign to *ppv.
 * @param   Type    The type of *ppv, sans volatile.
 */
#ifdef __GNUC__ /* 8.2.0 requires -Wno-ignored-qualifiers */
# define ASMAtomicXchgPtrT(ppv, pv, Type) \
    __extension__ \
    ({\
        __typeof__(*(ppv)) volatile * const ppvTypeChecked = (ppv); \
        Type                          const pvTypeChecked  = (pv); \
        Type pvTypeCheckedRet = (__typeof__(*(ppv))) ASMAtomicXchgPtr((void * volatile *)ppvTypeChecked, (void *)pvTypeChecked); \
        pvTypeCheckedRet; \
     })
#else
# define ASMAtomicXchgPtrT(ppv, pv, Type) \
    ((Type)ASMAtomicXchgPtr((void RT_FAR * volatile RT_FAR *)(ppv), (void RT_FAR *)(pv)))
#endif
1008
1009
1010/**
1011 * Atomically Exchange a raw-mode context pointer value, ordered.
1012 *
1013 * @returns Current *ppv value
1014 * @param ppvRC Pointer to the pointer variable to update.
1015 * @param pvRC The pointer value to assign to *ppv.
1016 */
1017DECLINLINE(RTRCPTR) ASMAtomicXchgRCPtr(RTRCPTR volatile RT_FAR *ppvRC, RTRCPTR pvRC) RT_NOTHROW_DEF
1018{
1019 return (RTRCPTR)ASMAtomicXchgU32((uint32_t volatile RT_FAR *)(void RT_FAR *)ppvRC, (uint32_t)pvRC);
1020}
1021
1022
1023/**
1024 * Atomically Exchange a ring-0 pointer value, ordered.
1025 *
1026 * @returns Current *ppv value
1027 * @param ppvR0 Pointer to the pointer variable to update.
1028 * @param pvR0 The pointer value to assign to *ppv.
1029 */
1030DECLINLINE(RTR0PTR) ASMAtomicXchgR0Ptr(RTR0PTR volatile RT_FAR *ppvR0, RTR0PTR pvR0) RT_NOTHROW_DEF
1031{
1032#if R0_ARCH_BITS == 32 || ARCH_BITS == 16
1033 return (RTR0PTR)ASMAtomicXchgU32((volatile uint32_t RT_FAR *)(void RT_FAR *)ppvR0, (uint32_t)pvR0);
1034#elif R0_ARCH_BITS == 64
1035 return (RTR0PTR)ASMAtomicXchgU64((volatile uint64_t RT_FAR *)(void RT_FAR *)ppvR0, (uint64_t)pvR0);
1036#else
1037# error "R0_ARCH_BITS is bogus"
1038#endif
1039}
1040
1041
1042/**
1043 * Atomically Exchange a ring-3 pointer value, ordered.
1044 *
1045 * @returns Current *ppv value
1046 * @param ppvR3 Pointer to the pointer variable to update.
1047 * @param pvR3 The pointer value to assign to *ppv.
1048 */
1049DECLINLINE(RTR3PTR) ASMAtomicXchgR3Ptr(RTR3PTR volatile RT_FAR *ppvR3, RTR3PTR pvR3) RT_NOTHROW_DEF
1050{
1051#if R3_ARCH_BITS == 32 || ARCH_BITS == 16
1052 return (RTR3PTR)ASMAtomicXchgU32((volatile uint32_t RT_FAR *)(void RT_FAR *)ppvR3, (uint32_t)pvR3);
1053#elif R3_ARCH_BITS == 64
1054 return (RTR3PTR)ASMAtomicXchgU64((volatile uint64_t RT_FAR *)(void RT_FAR *)ppvR3, (uint64_t)pvR3);
1055#else
1056# error "R3_ARCH_BITS is bogus"
1057#endif
1058}
1059
1060
1061/** @def ASMAtomicXchgHandle
1062 * Atomically Exchange a typical IPRT handle value, ordered.
1063 *
1064 * @param ph Pointer to the value to update.
1065 * @param hNew The new value to assigned to *pu.
1066 * @param phRes Where to store the current *ph value.
1067 *
1068 * @remarks This doesn't currently work for all handles (like RTFILE).
1069 */
1070#if HC_ARCH_BITS == 32 || ARCH_BITS == 16
1071# define ASMAtomicXchgHandle(ph, hNew, phRes) \
1072 do { \
1073 AssertCompile(sizeof(*(ph)) == sizeof(uint32_t)); \
1074 AssertCompile(sizeof(*(phRes)) == sizeof(uint32_t)); \
1075 *(uint32_t RT_FAR *)(phRes) = ASMAtomicXchgU32((uint32_t volatile RT_FAR *)(ph), (const uint32_t)(hNew)); \
1076 } while (0)
1077#elif HC_ARCH_BITS == 64
1078# define ASMAtomicXchgHandle(ph, hNew, phRes) \
1079 do { \
1080 AssertCompile(sizeof(*(ph)) == sizeof(uint64_t)); \
1081 AssertCompile(sizeof(*(phRes)) == sizeof(uint64_t)); \
1082 *(uint64_t RT_FAR *)(phRes) = ASMAtomicXchgU64((uint64_t volatile RT_FAR *)(ph), (const uint64_t)(hNew)); \
1083 } while (0)
1084#else
1085# error HC_ARCH_BITS
1086#endif
1087
1088
/**
 * Atomically exchanges a value whose size might differ between platforms or
 * compilers, ordered.
 *
 * Dispatches on sizeof(*pu) to the 8/16/32/64-bit exchange function.  Any
 * other size triggers an assertion.  The old value is discarded.
 *
 * @param   pu      Pointer to the variable to update.
 * @param   uNew    The value to assign to *pu.
 * @todo    This is busted as its missing the result argument.
 */
#define ASMAtomicXchgSize(pu, uNew) \
    do { \
        switch (sizeof(*(pu))) { \
            case 1: ASMAtomicXchgU8( (volatile uint8_t  RT_FAR *)(void RT_FAR *)(pu), (uint8_t)(uNew));  break; \
            case 2: ASMAtomicXchgU16((volatile uint16_t RT_FAR *)(void RT_FAR *)(pu), (uint16_t)(uNew)); break; \
            case 4: ASMAtomicXchgU32((volatile uint32_t RT_FAR *)(void RT_FAR *)(pu), (uint32_t)(uNew)); break; \
            case 8: ASMAtomicXchgU64((volatile uint64_t RT_FAR *)(void RT_FAR *)(pu), (uint64_t)(uNew)); break; \
            /* sizeof yields size_t; cast so the argument matches the %d conversion. */ \
            default: AssertMsgFailed(("ASMAtomicXchgSize: size %d is not supported\n", (int)sizeof(*(pu)))); break; \
        } \
    } while (0)
1107
/**
 * Atomically exchanges a value whose size might differ between platforms or
 * compilers, ordered.
 *
 * Dispatches on sizeof(*pu) to the 8/16/32/64-bit exchange function and
 * stores the returned value through @a puRes using the same width.  Any other
 * size triggers an assertion and leaves *puRes untouched.
 *
 * @param   pu      Pointer to the variable to update.
 * @param   uNew    The value to assign to *pu.
 * @param   puRes   Where to store the current *pu value.
 */
#define ASMAtomicXchgSizeCorrect(pu, uNew, puRes) \
    do { \
        switch (sizeof(*(pu))) { \
            case 1: *(uint8_t  RT_FAR *)(puRes) = ASMAtomicXchgU8( (volatile uint8_t  RT_FAR *)(void RT_FAR *)(pu), (uint8_t)(uNew));  break; \
            case 2: *(uint16_t RT_FAR *)(puRes) = ASMAtomicXchgU16((volatile uint16_t RT_FAR *)(void RT_FAR *)(pu), (uint16_t)(uNew)); break; \
            case 4: *(uint32_t RT_FAR *)(puRes) = ASMAtomicXchgU32((volatile uint32_t RT_FAR *)(void RT_FAR *)(pu), (uint32_t)(uNew)); break; \
            case 8: *(uint64_t RT_FAR *)(puRes) = ASMAtomicXchgU64((volatile uint64_t RT_FAR *)(void RT_FAR *)(pu), (uint64_t)(uNew)); break; \
            /* sizeof yields size_t; cast so the argument matches the %d conversion. */ \
            default: AssertMsgFailed(("ASMAtomicXchgSizeCorrect: size %d is not supported\n", (int)sizeof(*(pu)))); break; \
        } \
    } while (0)
1126
1127
1128
1129/**
1130 * Atomically Compare and Exchange an unsigned 8-bit value, ordered.
1131 *
1132 * @returns true if xchg was done.
1133 * @returns false if xchg wasn't done.
1134 *
1135 * @param pu8 Pointer to the value to update.
1136 * @param u8New The new value to assigned to *pu8.
1137 * @param u8Old The old value to *pu8 compare with.
1138 *
1139 * @remarks x86: Requires a 486 or later.
1140 * @todo Rename ASMAtomicCmpWriteU8
1141 */
1142#if RT_INLINE_ASM_EXTERNAL_TMP_ARM || !RT_INLINE_ASM_GNU_STYLE
1143RT_ASM_DECL_PRAGMA_WATCOM(bool) ASMAtomicCmpXchgU8(volatile uint8_t RT_FAR *pu8, const uint8_t u8New, const uint8_t u8Old) RT_NOTHROW_PROTO;
1144#else
1145DECLINLINE(bool) ASMAtomicCmpXchgU8(volatile uint8_t RT_FAR *pu8, const uint8_t u8New, uint8_t u8Old) RT_NOTHROW_DEF
1146{
1147# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
1148 uint8_t u8Ret;
1149 __asm__ __volatile__("lock; cmpxchgb %3, %0\n\t"
1150 "setz %1\n\t"
1151 : "=m" (*pu8)
1152 , "=qm" (u8Ret)
1153 , "=a" (u8Old)
1154 : "q" (u8New)
1155 , "2" (u8Old)
1156 , "m" (*pu8)
1157 : "cc");
1158 return (bool)u8Ret;
1159
1160# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
1161 union { uint32_t u; bool f; } fXchg;
1162 uint32_t u32Spill;
1163# if defined(RTASM_ARM64_USE_FEAT_LSE)
1164 __asm__ __volatile__("Lstart_ASMAtomicCmpXchgU8_%=:\n\t"
1165# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB) /* M1 bench: casalb=5625 vs dmb+casb=1597 vs non-lse=5623 (ps/call) */
1166 "casalb %w[uOldActual], %w[uNew], %[pMem]\n\t"
1167# else
1168 RTASM_ARM_DMB_SY
1169 "casb %w[uOldActual], %w[uNew], %[pMem]\n\t"
1170# endif
1171 "cmp %w[uOldActual], %w[uOldOrg]\n\t"
1172 "cset %w[fXchg], eq\n\t"
1173 : [pMem] "+Q" (*pu8)
1174 , [uOldActual] "=&r" (u32Spill)
1175 , [fXchg] "=&r" (fXchg.u)
1176 : [uNew] "r" ((uint32_t)u8New)
1177 , [uOldOrg] "r" ((uint32_t)u8Old)
1178 , "[uOldActual]" ((uint32_t)u8Old)
1179 : "cc");
1180# else
1181 uint32_t rcSpill;
1182 __asm__ __volatile__("Ltry_again_ASMAtomicCmpXchgU8_%=:\n\t"
1183 RTASM_ARM_DMB_SY
1184# if defined(RT_ARCH_ARM64)
1185 "ldaxrb %w[uOld], %[pMem]\n\t"
1186 "cmp %w[uOld], %w[uCmp]\n\t"
1187 "bne 1f\n\t" /* stop here if not equal */
1188 "stlxrb %w[rc], %w[uNew], %[pMem]\n\t"
1189 "cbnz %w[rc], Ltry_again_ASMAtomicCmpXchgU8_%=\n\t"
1190 "mov %w[fXchg], #1\n\t"
1191 "1:\n\t"
1192 "clrex\n\t"
1193# else
1194 "ldrexb %[uOld], %[pMem]\n\t"
1195 "teq %[uOld], %[uCmp]\n\t"
1196 "strexbeq %[rc], %[uNew], %[pMem]\n\t"
1197 "bne 1f\n\t" /* stop here if not equal */
1198 "cmp %[rc], #0\n\t"
1199 "bne Ltry_again_ASMAtomicCmpXchgU8_%=\n\t"
1200 "mov %[fXchg], #1\n\t"
1201 "1:\n\t"
1202 /** @todo clrexne on armv7? */
1203# endif
1204 : [pMem] "+Q" (*pu8)
1205 , [uOld] "=&r" (u32Spill)
1206 , [rc] "=&r" (rcSpill)
1207 , [fXchg] "=&r" (fXchg.u)
1208 : [uCmp] "r" ((uint32_t)u8Old)
1209 , [uNew] "r" ((uint32_t)u8New)
1210 , "[fXchg]" (0)
1211 RTASM_ARM_DMB_SY_COMMA_IN_REG
1212 : "cc");
1213# endif
1214 return fXchg.f;
1215
1216# else
1217# error "Port me"
1218# endif
1219}
1220#endif
1221
1222
1223/**
1224 * Atomically Compare and Exchange a signed 8-bit value, ordered.
1225 *
1226 * @returns true if xchg was done.
1227 * @returns false if xchg wasn't done.
1228 *
1229 * @param pi8 Pointer to the value to update.
1230 * @param i8New The new value to assigned to *pi8.
1231 * @param i8Old The old value to *pi8 compare with.
1232 *
1233 * @remarks x86: Requires a 486 or later.
1234 * @todo Rename ASMAtomicCmpWriteS8
1235 */
1236DECLINLINE(bool) ASMAtomicCmpXchgS8(volatile int8_t RT_FAR *pi8, const int8_t i8New, const int8_t i8Old) RT_NOTHROW_DEF
1237{
1238 return ASMAtomicCmpXchgU8((volatile uint8_t RT_FAR *)pi8, (uint8_t)i8New, (uint8_t)i8Old);
1239}
1240
1241
1242/**
1243 * Atomically Compare and Exchange a bool value, ordered.
1244 *
1245 * @returns true if xchg was done.
1246 * @returns false if xchg wasn't done.
1247 *
1248 * @param pf Pointer to the value to update.
1249 * @param fNew The new value to assigned to *pf.
1250 * @param fOld The old value to *pf compare with.
1251 *
1252 * @remarks x86: Requires a 486 or later.
1253 * @todo Rename ASMAtomicCmpWriteBool
1254 */
1255DECLINLINE(bool) ASMAtomicCmpXchgBool(volatile bool RT_FAR *pf, const bool fNew, const bool fOld) RT_NOTHROW_DEF
1256{
1257 return ASMAtomicCmpXchgU8((volatile uint8_t RT_FAR *)pf, (uint8_t)fNew, (uint8_t)fOld);
1258}
1259
1260
1261/**
1262 * Atomically Compare and Exchange an unsigned 32-bit value, ordered.
1263 *
1264 * @returns true if xchg was done.
1265 * @returns false if xchg wasn't done.
1266 *
1267 * @param pu32 Pointer to the value to update.
1268 * @param u32New The new value to assigned to *pu32.
1269 * @param u32Old The old value to *pu32 compare with.
1270 *
1271 * @remarks x86: Requires a 486 or later.
1272 * @todo Rename ASMAtomicCmpWriteU32
1273 */
1274#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
1275RT_ASM_DECL_PRAGMA_WATCOM(bool) ASMAtomicCmpXchgU32(volatile uint32_t RT_FAR *pu32, const uint32_t u32New, const uint32_t u32Old) RT_NOTHROW_PROTO;
1276#else
1277DECLINLINE(bool) ASMAtomicCmpXchgU32(volatile uint32_t RT_FAR *pu32, const uint32_t u32New, uint32_t u32Old) RT_NOTHROW_DEF
1278{
1279# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
1280# if RT_INLINE_ASM_GNU_STYLE
1281 uint8_t u8Ret;
1282 __asm__ __volatile__("lock; cmpxchgl %3, %0\n\t"
1283 "setz %1\n\t"
1284 : "=m" (*pu32)
1285 , "=qm" (u8Ret)
1286 , "=a" (u32Old)
1287 : "r" (u32New)
1288 , "2" (u32Old)
1289 , "m" (*pu32)
1290 : "cc");
1291 return (bool)u8Ret;
1292
1293# elif RT_INLINE_ASM_USES_INTRIN
1294 return (uint32_t)_InterlockedCompareExchange((long RT_FAR *)pu32, u32New, u32Old) == u32Old;
1295
1296# else
1297 uint32_t u32Ret;
1298 __asm
1299 {
1300# ifdef RT_ARCH_AMD64
1301 mov rdx, [pu32]
1302# else
1303 mov edx, [pu32]
1304# endif
1305 mov eax, [u32Old]
1306 mov ecx, [u32New]
1307# ifdef RT_ARCH_AMD64
1308 lock cmpxchg [rdx], ecx
1309# else
1310 lock cmpxchg [edx], ecx
1311# endif
1312 setz al
1313 movzx eax, al
1314 mov [u32Ret], eax
1315 }
1316 return !!u32Ret;
1317# endif
1318
1319# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
1320 union { uint32_t u; bool f; } fXchg;
1321 uint32_t u32Spill;
1322 /* M1 bench: match: casal= 6592 vs dmb+cas= 1562 vs non-lse=5634 (ps/call)
1323 mismatch: casal=18794 vs dmb+cas=19697 vs non-lse=2499 (ps/call) */
1324# if defined(RTASM_ARM64_USE_FEAT_LSE)
1325 __asm__ __volatile__("Lstart_ASMAtomicCmpXchgU32_%=:\n\t"
1326# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
1327 "casal %w[uOldActual], %w[uNew], %[pMem]\n\t"
1328# else
1329 RTASM_ARM_DMB_SY
1330 "cas %w[uOldActual], %w[uNew], %[pMem]\n\t"
1331# endif
1332 "cmp %w[uOldActual], %w[uOldOrg]\n\t"
1333 "cset %w[fXchg], eq\n\t"
1334 : [pMem] "+Q" (*pu32)
1335 , [uOldActual] "=&r" (u32Spill)
1336 , [fXchg] "=&r" (fXchg.u)
1337 : [uNew] "r" (u32New)
1338 , [uOldOrg] "r" (u32Old)
1339 , "[uOldActual]" (u32Old)
1340 : "cc");
1341# else
1342 uint32_t rcSpill;
1343 __asm__ __volatile__("Ltry_again_ASMAtomicCmpXchgU32_%=:\n\t"
1344 RTASM_ARM_DMB_SY
1345# if defined(RT_ARCH_ARM64)
1346 "ldaxr %w[uOld], %[pMem]\n\t"
1347 "cmp %w[uOld], %w[uCmp]\n\t"
1348 "bne 1f\n\t" /* stop here if not equal */
1349 "stlxr %w[rc], %w[uNew], %[pMem]\n\t"
1350 "cbnz %w[rc], Ltry_again_ASMAtomicCmpXchgU32_%=\n\t"
1351 "mov %w[fXchg], #1\n\t"
1352 "1:\n\t"
1353 "clrex\n\t"
1354# else
1355 "ldrex %[uOld], %[pMem]\n\t"
1356 "teq %[uOld], %[uCmp]\n\t"
1357 "strexeq %[rc], %[uNew], %[pMem]\n\t"
1358 "bne 1f\n\t" /* stop here if not equal */
1359 "cmp %[rc], #0\n\t"
1360 "bne Ltry_again_ASMAtomicCmpXchgU32_%=\n\t"
1361 "mov %[fXchg], #1\n\t"
1362 "1:\n\t"
1363 /** @todo clrexne on armv7? */
1364# endif
1365 : [pMem] "+Q" (*pu32)
1366 , [uOld] "=&r" (u32Spill)
1367 , [rc] "=&r" (rcSpill)
1368 , [fXchg] "=&r" (fXchg.u)
1369 : [uCmp] "r" (u32Old)
1370 , [uNew] "r" (u32New)
1371 , "[fXchg]" (0)
1372 RTASM_ARM_DMB_SY_COMMA_IN_REG
1373 : "cc");
1374# endif
1375 return fXchg.f;
1376
1377# else
1378# error "Port me"
1379# endif
1380}
1381#endif
1382
1383
1384/**
1385 * Atomically Compare and Exchange a signed 32-bit value, ordered.
1386 *
1387 * @returns true if xchg was done.
1388 * @returns false if xchg wasn't done.
1389 *
1390 * @param pi32 Pointer to the value to update.
1391 * @param i32New The new value to assigned to *pi32.
1392 * @param i32Old The old value to *pi32 compare with.
1393 *
1394 * @remarks x86: Requires a 486 or later.
1395 * @todo Rename ASMAtomicCmpWriteS32
1396 */
1397DECLINLINE(bool) ASMAtomicCmpXchgS32(volatile int32_t RT_FAR *pi32, const int32_t i32New, const int32_t i32Old) RT_NOTHROW_DEF
1398{
1399 return ASMAtomicCmpXchgU32((volatile uint32_t RT_FAR *)pi32, (uint32_t)i32New, (uint32_t)i32Old);
1400}
1401
1402
1403/**
1404 * Atomically Compare and exchange an unsigned 64-bit value, ordered.
1405 *
1406 * @returns true if xchg was done.
1407 * @returns false if xchg wasn't done.
1408 *
1409 * @param pu64 Pointer to the 64-bit variable to update.
1410 * @param u64New The 64-bit value to assign to *pu64.
1411 * @param u64Old The value to compare with.
1412 *
1413 * @remarks x86: Requires a Pentium or later.
1414 * @todo Rename ASMAtomicCmpWriteU64
1415 */
1416#if (RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN) \
1417 || RT_INLINE_DONT_MIX_CMPXCHG8B_AND_PIC
1418RT_ASM_DECL_PRAGMA_WATCOM(bool) ASMAtomicCmpXchgU64(volatile uint64_t RT_FAR *pu64, const uint64_t u64New, const uint64_t u64Old) RT_NOTHROW_PROTO;
1419#else
1420DECLINLINE(bool) ASMAtomicCmpXchgU64(volatile uint64_t RT_FAR *pu64, uint64_t u64New, uint64_t u64Old) RT_NOTHROW_DEF
1421{
1422# if RT_INLINE_ASM_USES_INTRIN
1423 return (uint64_t)_InterlockedCompareExchange64((__int64 RT_FAR *)pu64, u64New, u64Old) == u64Old;
1424
1425# elif defined(RT_ARCH_AMD64)
1426# if RT_INLINE_ASM_GNU_STYLE
1427 uint8_t u8Ret;
1428 __asm__ __volatile__("lock; cmpxchgq %3, %0\n\t"
1429 "setz %1\n\t"
1430 : "=m" (*pu64)
1431 , "=qm" (u8Ret)
1432 , "=a" (u64Old)
1433 : "r" (u64New)
1434 , "2" (u64Old)
1435 , "m" (*pu64)
1436 : "cc");
1437 return (bool)u8Ret;
1438# else
1439 bool fRet;
1440 __asm
1441 {
1442 mov rdx, [pu32]
1443 mov rax, [u64Old]
1444 mov rcx, [u64New]
1445 lock cmpxchg [rdx], rcx
1446 setz al
1447 mov [fRet], al
1448 }
1449 return fRet;
1450# endif
1451
1452# elif defined(RT_ARCH_X86)
1453 uint32_t u32Ret;
1454# if RT_INLINE_ASM_GNU_STYLE
1455# if defined(PIC) || defined(__PIC__)
1456 uint32_t u32EBX = (uint32_t)u64New;
1457 uint32_t u32Spill;
1458 __asm__ __volatile__("xchgl %%ebx, %4\n\t"
1459 "lock; cmpxchg8b (%6)\n\t"
1460 "setz %%al\n\t"
1461 "movl %4, %%ebx\n\t"
1462 "movzbl %%al, %%eax\n\t"
1463 : "=a" (u32Ret)
1464 , "=d" (u32Spill)
1465# if RT_GNUC_PREREQ(4, 3)
1466 , "+m" (*pu64)
1467# else
1468 , "=m" (*pu64)
1469# endif
1470 : "A" (u64Old)
1471 , "m" ( u32EBX )
1472 , "c" ( (uint32_t)(u64New >> 32) )
1473 , "S" (pu64)
1474 : "cc");
1475# else /* !PIC */
1476 uint32_t u32Spill;
1477 __asm__ __volatile__("lock; cmpxchg8b %2\n\t"
1478 "setz %%al\n\t"
1479 "movzbl %%al, %%eax\n\t"
1480 : "=a" (u32Ret)
1481 , "=d" (u32Spill)
1482 , "+m" (*pu64)
1483 : "A" (u64Old)
1484 , "b" ( (uint32_t)u64New )
1485 , "c" ( (uint32_t)(u64New >> 32) )
1486 : "cc");
1487# endif
1488 return (bool)u32Ret;
1489# else
1490 __asm
1491 {
1492 mov ebx, dword ptr [u64New]
1493 mov ecx, dword ptr [u64New + 4]
1494 mov edi, [pu64]
1495 mov eax, dword ptr [u64Old]
1496 mov edx, dword ptr [u64Old + 4]
1497 lock cmpxchg8b [edi]
1498 setz al
1499 movzx eax, al
1500 mov dword ptr [u32Ret], eax
1501 }
1502 return !!u32Ret;
1503# endif
1504
1505# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
1506 union { uint32_t u; bool f; } fXchg;
1507 uint64_t u64Spill;
1508 /* M1 bench: match: casal= 6599 vs dmb+cas= 1565 vs non-lse=5000 (ps/call)
1509 mismatch: casal=18797 vs dmb+cas=19731 vs non-lse=2512 (ps/call) */
1510# if defined(RTASM_ARM64_USE_FEAT_LSE)
1511 __asm__ __volatile__("Lstart_ASMAtomicCmpXchgU75_%=:\n\t"
1512# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
1513 "casal %[uOldActual], %[uNew], %[pMem]\n\t"
1514# else
1515 RTASM_ARM_DMB_SY
1516 "cas %[uOldActual], %[uNew], %[pMem]\n\t"
1517# endif
1518 "cmp %[uOldActual], %[uOldOrg]\n\t"
1519 "cset %w[fXchg], eq\n\t"
1520 : [pMem] "+Q" (*pu64)
1521 , [uOldActual] "=&r" (u64Spill)
1522 , [fXchg] "=&r" (fXchg.u)
1523 : [uNew] "r" (u64New)
1524 , [uOldOrg] "r" (u64Old)
1525 , "[uOldActual]" (u64Old)
1526 : "cc");
1527# else
1528 uint32_t rcSpill;
1529 __asm__ __volatile__("Ltry_again_ASMAtomicCmpXchgU64_%=:\n\t"
1530 RTASM_ARM_DMB_SY
1531# if defined(RT_ARCH_ARM64)
1532 "ldaxr %[uOld], %[pMem]\n\t"
1533 "cmp %[uOld], %[uCmp]\n\t"
1534 "bne 1f\n\t" /* stop here if not equal */
1535 "stlxr %w[rc], %[uNew], %[pMem]\n\t"
1536 "cbnz %w[rc], Ltry_again_ASMAtomicCmpXchgU64_%=\n\t"
1537 "mov %w[fXchg], #1\n\t"
1538 "1:\n\t"
1539 "clrex\n\t"
1540# else
1541 "ldrexd %[uOld], %H[uOld], %[pMem]\n\t"
1542 "teq %[uOld], %[uCmp]\n\t"
1543 "teqeq %H[uOld], %H[uCmp]\n\t"
1544 "strexdeq %[rc], %[uNew], %H[uNew], %[pMem]\n\t"
1545 "bne 1f\n\t" /* stop here if not equal */
1546 "cmp %[rc], #0\n\t"
1547 "bne Ltry_again_ASMAtomicCmpXchgU64_%=\n\t"
1548 "mov %[fXchg], #1\n\t"
1549 "1:\n\t"
1550 /** @todo clrexne on armv7? */
1551# endif
1552 : [pMem] "+Q" (*pu64)
1553 , [uOld] "=&r" (u64Spill)
1554 , [rc] "=&r" (rcSpill)
1555 , [fXchg] "=&r" (fXchg.u)
1556 : [uCmp] "r" (u64Old)
1557 , [uNew] "r" (u64New)
1558 , "[fXchg]" (0)
1559 RTASM_ARM_DMB_SY_COMMA_IN_REG
1560 : "cc");
1561# endif
1562 return fXchg.f;
1563
1564# else
1565# error "Port me"
1566# endif
1567}
1568#endif
1569
1570
1571/**
1572 * Atomically Compare and exchange a signed 64-bit value, ordered.
1573 *
1574 * @returns true if xchg was done.
1575 * @returns false if xchg wasn't done.
1576 *
1577 * @param pi64 Pointer to the 64-bit variable to update.
1578 * @param i64 The 64-bit value to assign to *pu64.
1579 * @param i64Old The value to compare with.
1580 *
1581 * @remarks x86: Requires a Pentium or later.
1582 * @todo Rename ASMAtomicCmpWriteS64
1583 */
1584DECLINLINE(bool) ASMAtomicCmpXchgS64(volatile int64_t RT_FAR *pi64, const int64_t i64, const int64_t i64Old) RT_NOTHROW_DEF
1585{
1586 return ASMAtomicCmpXchgU64((volatile uint64_t RT_FAR *)pi64, (uint64_t)i64, (uint64_t)i64Old);
1587}
1588
1589#if defined(RT_ARCH_AMD64) || defined(RT_ARCH_ARM64) || defined(DOXYGEN_RUNNING)
1590
1591/** @def RTASM_HAVE_CMP_WRITE_U128
1592 * Indicates that we've got ASMAtomicCmpWriteU128(), ASMAtomicCmpWriteU128v2()
1593 * and ASMAtomicCmpWriteExU128() available. */
1594# define RTASM_HAVE_CMP_WRITE_U128 1
1595
1596
1597/**
1598 * Atomically compare and write an unsigned 128-bit value, ordered.
1599 *
1600 * @returns true if write was done.
1601 * @returns false if write wasn't done.
1602 *
1603 * @param pu128 Pointer to the 128-bit variable to update.
1604 * @param u64NewHi The high 64 bits of the value to assign to *pu128.
1605 * @param u64NewLo The low 64 bits of the value to assign to *pu128.
1606 * @param u64OldHi The high 64-bit of the value to compare with.
1607 * @param u64OldLo The low 64-bit of the value to compare with.
1608 *
1609 * @remarks AMD64: Not present in the earliest CPUs, so check CPUID.
1610 */
1611# if (RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN)
1612DECLASM(bool) ASMAtomicCmpWriteU128v2(volatile uint128_t *pu128, const uint64_t u64NewHi, const uint64_t u64NewLo,
1613 const uint64_t u64OldHi, const uint64_t u64OldLo) RT_NOTHROW_PROTO;
1614# else
1615DECLINLINE(bool) ASMAtomicCmpWriteU128v2(volatile uint128_t *pu128, const uint64_t u64NewHi, const uint64_t u64NewLo,
1616 const uint64_t u64OldHi, const uint64_t u64OldLo) RT_NOTHROW_DEF
1617{
1618# if RT_INLINE_ASM_USES_INTRIN
1619 __int64 ai64Cmp[2];
1620 ai64Cmp[0] = u64OldLo;
1621 ai64Cmp[1] = u64OldHi;
1622 return _InterlockedCompareExchange128((__int64 volatile *)pu128, u64NewHi, u64NewLo, ai64Cmp) != 0;
1623
1624# elif (defined(__clang_major__) || defined(__GNUC__)) && defined(RT_ARCH_ARM64)
1625 return __sync_bool_compare_and_swap(pu128, ((uint128_t)u64OldHi << 64) | u64OldLo, ((uint128_t)u64NewHi << 64) | u64NewLo);
1626
1627# elif defined(RT_ARCH_AMD64)
1628# if RT_INLINE_ASM_GNU_STYLE
1629 uint64_t u64Ret;
1630 uint64_t u64Spill;
1631 __asm__ __volatile__("lock; cmpxchg16b %2\n\t"
1632 "setz %%al\n\t"
1633 "movzbl %%al, %%eax\n\t"
1634 : "=a" (u64Ret)
1635 , "=d" (u64Spill)
1636 , "+m" (*pu128)
1637 : "a" (u64OldLo)
1638 , "d" (u64OldHi)
1639 , "b" (u64NewLo)
1640 , "c" (u64NewHi)
1641 : "cc");
1642
1643 return (bool)u64Ret;
1644# else
1645# error "Port me"
1646# endif
1647# else
1648# error "Port me"
1649# endif
1650}
1651# endif
1652
1653
1654/**
1655 * Atomically compare and write an unsigned 128-bit value, ordered.
1656 *
1657 * @returns true if write was done.
1658 * @returns false if write wasn't done.
1659 *
1660 * @param pu128 Pointer to the 128-bit variable to update.
1661 * @param u128New The 128-bit value to assign to *pu128.
1662 * @param u128Old The value to compare with.
1663 *
1664 * @remarks AMD64: Not present in the earliest CPUs, so check CPUID.
1665 */
1666DECLINLINE(bool) ASMAtomicCmpWriteU128(volatile uint128_t *pu128, const uint128_t u128New, const uint128_t u128Old) RT_NOTHROW_DEF
1667{
1668# ifdef RT_COMPILER_WITH_128BIT_INT_TYPES
1669# if (defined(__clang_major__) || defined(__GNUC__)) && defined(RT_ARCH_ARM64)
1670 return __sync_bool_compare_and_swap(pu128, u128Old, u128New);
1671# else
1672 return ASMAtomicCmpWriteU128v2(pu128, (uint64_t)(u128New >> 64), (uint64_t)u128New,
1673 (uint64_t)(u128Old >> 64), (uint64_t)u128Old);
1674# endif
1675# else
1676 return ASMAtomicCmpWriteU128v2(pu128, u128New.Hi, u128New.Lo, u128Old.Hi, u128Old.Lo);
1677# endif
1678}
1679
1680
1681/**
1682 * RTUINT128U wrapper for ASMAtomicCmpWriteU128.
1683 */
1684DECLINLINE(bool) ASMAtomicCmpWriteU128U(volatile RTUINT128U *pu128, const RTUINT128U u128New,
1685 const RTUINT128U u128Old) RT_NOTHROW_DEF
1686{
1687# if (defined(__clang_major__) || defined(__GNUC__)) && defined(RT_ARCH_ARM64)
1688 return ASMAtomicCmpWriteU128(&pu128->u, u128New.u, u128Old.u);
1689# else
1690 return ASMAtomicCmpWriteU128v2(&pu128->u, u128New.s.Hi, u128New.s.Lo, u128Old.s.Hi, u128Old.s.Lo);
1691# endif
1692}
1693
1694#endif /* RT_ARCH_AMD64 || RT_ARCH_ARM64 */
1695
1696/**
1697 * Atomically Compare and Exchange a pointer value, ordered.
1698 *
1699 * @returns true if xchg was done.
1700 * @returns false if xchg wasn't done.
1701 *
1702 * @param ppv Pointer to the value to update.
1703 * @param pvNew The new value to assigned to *ppv.
1704 * @param pvOld The old value to *ppv compare with.
1705 *
1706 * @remarks x86: Requires a 486 or later.
1707 * @todo Rename ASMAtomicCmpWritePtrVoid
1708 */
1709DECLINLINE(bool) ASMAtomicCmpXchgPtrVoid(void RT_FAR * volatile RT_FAR *ppv, const void RT_FAR *pvNew, const void RT_FAR *pvOld) RT_NOTHROW_DEF
1710{
1711#if ARCH_BITS == 32 || ARCH_BITS == 16
1712 return ASMAtomicCmpXchgU32((volatile uint32_t RT_FAR *)(void RT_FAR *)ppv, (uint32_t)pvNew, (uint32_t)pvOld);
1713#elif ARCH_BITS == 64
1714 return ASMAtomicCmpXchgU64((volatile uint64_t RT_FAR *)(void RT_FAR *)ppv, (uint64_t)pvNew, (uint64_t)pvOld);
1715#else
1716# error "ARCH_BITS is bogus"
1717#endif
1718}
1719
1720
/**
 * Atomically compares and exchanges a pointer value, ordered.
 *
 * Typed front-end for ASMAtomicCmpXchgPtrVoid().  The GNU C variant first
 * assigns all three arguments to temporaries typed after *ppv, so mismatching
 * pointer types are diagnosed; the generic variant just casts.
 *
 * @returns true if xchg was done.
 * @returns false if xchg wasn't done.
 *
 * @param   ppv     Pointer to the value to update.
 * @param   pvNew   The new value to assign to *ppv.
 * @param   pvOld   The value *ppv is compared with.
 *
 * @remarks This is relatively type safe on GCC platforms.
 * @remarks x86: Requires a 486 or later.
 * @todo Rename ASMAtomicCmpWritePtr
 */
#ifdef __GNUC__
# define ASMAtomicCmpXchgPtr(ppv, pvNew, pvOld) \
    __extension__ \
    ({\
        __typeof__(*(ppv)) volatile * const ppvTypeChecked   = (ppv); \
        __typeof__(*(ppv))            const pvNewTypeChecked = (pvNew); \
        __typeof__(*(ppv))            const pvOldTypeChecked = (pvOld); \
        bool const fMacroRet = ASMAtomicCmpXchgPtrVoid((void * volatile *)ppvTypeChecked, \
                                                       (void *)pvNewTypeChecked, \
                                                       (void *)pvOldTypeChecked); \
        fMacroRet; \
     })
#else
# define ASMAtomicCmpXchgPtr(ppv, pvNew, pvOld) \
    ASMAtomicCmpXchgPtrVoid((void RT_FAR * volatile RT_FAR *)(ppv), (void RT_FAR *)(pvNew), (void RT_FAR *)(pvOld))
#endif
1750
1751
1752/** @def ASMAtomicCmpXchgHandle
1753 * Atomically Compare and Exchange a typical IPRT handle value, ordered.
1754 *
1755 * @param ph Pointer to the value to update.
1756 * @param hNew The new value to assigned to *pu.
1757 * @param hOld The old value to *pu compare with.
1758 * @param fRc Where to store the result.
1759 *
1760 * @remarks This doesn't currently work for all handles (like RTFILE).
1761 * @remarks x86: Requires a 486 or later.
1762 * @todo Rename ASMAtomicCmpWriteHandle
1763 */
1764#if HC_ARCH_BITS == 32 || ARCH_BITS == 16
1765# define ASMAtomicCmpXchgHandle(ph, hNew, hOld, fRc) \
1766 do { \
1767 AssertCompile(sizeof(*(ph)) == sizeof(uint32_t)); \
1768 (fRc) = ASMAtomicCmpXchgU32((uint32_t volatile RT_FAR *)(ph), (const uint32_t)(hNew), (const uint32_t)(hOld)); \
1769 } while (0)
1770#elif HC_ARCH_BITS == 64
1771# define ASMAtomicCmpXchgHandle(ph, hNew, hOld, fRc) \
1772 do { \
1773 AssertCompile(sizeof(*(ph)) == sizeof(uint64_t)); \
1774 (fRc) = ASMAtomicCmpXchgU64((uint64_t volatile RT_FAR *)(ph), (const uint64_t)(hNew), (const uint64_t)(hOld)); \
1775 } while (0)
1776#else
1777# error HC_ARCH_BITS
1778#endif
1779
1780
/** @def ASMAtomicCmpXchgSize
 * Atomically compares and exchanges a value whose size might differ between
 * platforms or compilers, ordered.
 *
 * Only 4 and 8 byte operands are handled; any other size triggers an
 * assertion and sets @a fRc to false.
 *
 * @param   pu      Pointer to the value to update.
 * @param   uNew    The new value to assign to *pu.
 * @param   uOld    The value *pu is compared with.
 * @param   fRc     Where to store the result.
 *
 * @remarks x86: Requires a 486 or later.
 * @todo Rename ASMAtomicCmpWriteSize
 */
#define ASMAtomicCmpXchgSize(pu, uNew, uOld, fRc) \
    do { \
        switch (sizeof(*(pu))) { \
            case 4: (fRc) = ASMAtomicCmpXchgU32((volatile uint32_t RT_FAR *)(void RT_FAR *)(pu), (uint32_t)(uNew), (uint32_t)(uOld)); \
                break; \
            case 8: (fRc) = ASMAtomicCmpXchgU64((volatile uint64_t RT_FAR *)(void RT_FAR *)(pu), (uint64_t)(uNew), (uint64_t)(uOld)); \
                break; \
            /* sizeof yields size_t; cast so the argument matches the %d conversion. */ \
            default: AssertMsgFailed(("ASMAtomicCmpXchgSize: size %d is not supported\n", (int)sizeof(*(pu)))); \
                (fRc) = false; \
                break; \
        } \
    } while (0)
1805
1806
1807/**
1808 * Atomically Compare and Exchange an unsigned 8-bit value, additionally passes
1809 * back old value, ordered.
1810 *
1811 * @returns true if xchg was done.
1812 * @returns false if xchg wasn't done.
1813 *
1814 * @param pu8 Pointer to the value to update.
1815 * @param u8New The new value to assigned to *pu32.
1816 * @param u8Old The old value to *pu8 compare with.
1817 * @param pu8Old Pointer store the old value at.
1818 *
1819 * @remarks x86: Requires a 486 or later.
1820 */
1821#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
1822RT_ASM_DECL_PRAGMA_WATCOM(bool) ASMAtomicCmpXchgExU8(volatile uint8_t RT_FAR *pu8, const uint8_t u8New, const uint8_t u8Old, uint8_t RT_FAR *pu8Old) RT_NOTHROW_PROTO;
1823#else
1824DECLINLINE(bool) ASMAtomicCmpXchgExU8(volatile uint8_t RT_FAR *pu8, const uint8_t u8New, const uint8_t u8Old, uint8_t RT_FAR *pu8Old) RT_NOTHROW_DEF
1825{
1826# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
1827# if RT_INLINE_ASM_GNU_STYLE
1828 uint8_t u8Ret;
1829 __asm__ __volatile__("lock; cmpxchgb %3, %0\n\t"
1830 "setz %1\n\t"
1831 : "=m" (*pu8)
1832 , "=qm" (u8Ret)
1833 , "=a" (*pu8Old)
1834# if defined(RT_ARCH_X86)
1835 : "q" (u8New)
1836# else
1837 : "r" (u8New)
1838# endif
1839 , "a" (u8Old)
1840 , "m" (*pu8)
1841 : "cc");
1842 return (bool)u8Ret;
1843
1844# elif RT_INLINE_ASM_USES_INTRIN
1845 return (*pu8Old = _InterlockedCompareExchange8((char RT_FAR *)pu8, u8New, u8Old)) == u8Old;
1846
1847# else
1848 uint8_t u8Ret;
1849 __asm
1850 {
1851# ifdef RT_ARCH_AMD64
1852 mov rdx, [pu8]
1853# else
1854 mov edx, [pu8]
1855# endif
1856 mov eax, [u8Old]
1857 mov ecx, [u8New]
1858# ifdef RT_ARCH_AMD64
1859 lock cmpxchg [rdx], ecx
1860 mov rdx, [pu8Old]
1861 mov [rdx], eax
1862# else
1863 lock cmpxchg [edx], ecx
1864 mov edx, [pu8Old]
1865 mov [edx], eax
1866# endif
1867 setz al
1868 movzx eax, al
1869 mov [u8Ret], eax
1870 }
1871 return !!u8Ret;
1872# endif
1873
1874# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
1875 /* M1 bench: match: casalb= 6594 vs dmb+casb= 1561 vs non-lse=5051 (ps/call)
1876 mismatch: casalb=15346 vs dmb+casb=16349 vs non-lse=2505 (ps/call) */
1877# if defined(RTASM_ARM64_USE_FEAT_LSE)
1878 union { uint32_t u; bool f; } fXchg;
1879 uint32_t u32Actual;
1880 __asm__ __volatile__("Lstart_ASMAtomicCmpXchgExU8_%=:\n\t"
1881# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
1882 "casalb %w[uOldActual], %w[uNew], %[pMem]\n\t"
1883# else
1884 RTASM_ARM_DMB_SY
1885 "casb %w[uOldActual], %w[uNew], %[pMem]\n\t"
1886# endif
1887 "cmp %w[uOldActual], %w[uOldOrg]\n\t"
1888 "cset %w[fXchg], eq\n\t"
1889 : [pMem] "+Q" (*pu8)
1890 , [uOldActual] "=&r" (u32Actual)
1891 , [fXchg] "=&r" (fXchg.u)
1892 : [uNew] "r" ((uint32_t)u8New)
1893 , [uOldOrg] "r" ((uint32_t)u8Old)
1894 , "[uOldActual]" ((uint32_t)u8Old)
1895 : "cc");
1896 *pu8Old = (uint8_t)u32Actual;
1897# else
1898 union { uint8_t u; bool f; } fXchg;
1899 uint8_t u8ActualOld;
1900 uint8_t rcSpill;
1901 __asm__ __volatile__("Ltry_again_ASMAtomicCmpXchgExU8_%=:\n\t"
1902 RTASM_ARM_DMB_SY
1903# if defined(RT_ARCH_ARM64)
1904 "ldaxrb %w[uOld], %[pMem]\n\t"
1905 "cmp %w[uOld], %w[uCmp]\n\t"
1906 "bne 1f\n\t" /* stop here if not equal */
1907 "stlxrb %w[rc], %w[uNew], %[pMem]\n\t"
1908 "cbnz %w[rc], Ltry_again_ASMAtomicCmpXchgExU8_%=\n\t"
1909 "mov %w[fXchg], #1\n\t"
1910 "1:\n\t"
1911 "clrex\n\t"
1912# else
1913 "ldrexb %[uOld], %[pMem]\n\t"
1914 "teq %[uOld], %[uCmp]\n\t"
1915 "strexbeq %[rc], %[uNew], %[pMem]\n\t"
1916 "bne 1f\n\t" /* stop here if not equal */
1917 "cmp %[rc], #0\n\t"
1918 "bne Ltry_again_ASMAtomicCmpXchgExU8_%=\n\t"
1919 "mov %[fXchg], #1\n\t"
1920 "1:\n\t"
1921 /** @todo clrexne on armv7? */
1922# endif
1923 : [pMem] "+Q" (*pu8)
1924 , [uOld] "=&r" (u8ActualOld)
1925 , [rc] "=&r" (rcSpill)
1926 , [fXchg] "=&r" (fXchg.u)
1927 : [uCmp] "r" (u8Old)
1928 , [uNew] "r" (u8New)
1929 , "[fXchg]" (0)
1930 RTASM_ARM_DMB_SY_COMMA_IN_REG
1931 : "cc");
1932 *pu8Old = u8ActualOld;
1933# endif
1934 return fXchg.f;
1935
1936# else
1937# error "Port me"
1938# endif
1939}
1940#endif
1941
1942
1943/**
1944 * Atomically Compare and Exchange a signed 8-bit value, additionally
1945 * passes back old value, ordered.
1946 *
1947 * @returns true if xchg was done.
1948 * @returns false if xchg wasn't done.
1949 *
1950 * @param pi8 Pointer to the value to update.
1951 * @param i8New The new value to assigned to *pi8.
1952 * @param i8Old The old value to *pi8 compare with.
1953 * @param pi8Old Pointer store the old value at.
1954 *
1955 * @remarks x86: Requires a 486 or later.
1956 */
1957DECLINLINE(bool) ASMAtomicCmpXchgExS8(volatile int8_t RT_FAR *pi8, const int8_t i8New, const int8_t i8Old, int8_t RT_FAR *pi8Old) RT_NOTHROW_DEF
1958{
1959 return ASMAtomicCmpXchgExU8((volatile uint8_t RT_FAR *)pi8, (uint8_t)i8New, (uint8_t)i8Old, (uint8_t RT_FAR *)pi8Old);
1960}
1961
1962
1963/**
1964 * Atomically Compare and Exchange an unsigned 16-bit value, additionally passes
1965 * back old value, ordered.
1966 *
1967 * @returns true if xchg was done.
1968 * @returns false if xchg wasn't done.
1969 *
1970 * @param pu16 Pointer to the value to update.
1971 * @param u16New The new value to assigned to *pu16.
1972 * @param u16Old The old value to *pu32 compare with.
1973 * @param pu16Old Pointer store the old value at.
1974 *
1975 * @remarks x86: Requires a 486 or later.
1976 */
1977#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
1978RT_ASM_DECL_PRAGMA_WATCOM(bool) ASMAtomicCmpXchgExU16(volatile uint16_t RT_FAR *pu16, const uint16_t u16New, const uint16_t u16Old, uint16_t RT_FAR *pu16Old) RT_NOTHROW_PROTO;
1979#else
1980DECLINLINE(bool) ASMAtomicCmpXchgExU16(volatile uint16_t RT_FAR *pu16, const uint16_t u16New, const uint16_t u16Old, uint16_t RT_FAR *pu16Old) RT_NOTHROW_DEF
1981{
1982# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
1983# if RT_INLINE_ASM_GNU_STYLE
1984 uint8_t u8Ret;
1985 __asm__ __volatile__("lock; cmpxchgw %3, %0\n\t"
1986 "setz %1\n\t"
1987 : "=m" (*pu16)
1988 , "=qm" (u8Ret)
1989 , "=a" (*pu16Old)
1990 : "r" (u16New)
1991 , "a" (u16Old)
1992 , "m" (*pu16)
1993 : "cc");
1994 return (bool)u8Ret;
1995
1996# elif RT_INLINE_ASM_USES_INTRIN
1997 return (*pu16Old = _InterlockedCompareExchange16((short RT_FAR *)pu16, u16New, u16Old)) == u16Old;
1998
1999# else
2000 uint16_t u16Ret;
2001 __asm
2002 {
2003# ifdef RT_ARCH_AMD64
2004 mov rdx, [pu16]
2005# else
2006 mov edx, [pu16]
2007# endif
2008 mov eax, [u16Old]
2009 mov ecx, [u16New]
2010# ifdef RT_ARCH_AMD64
2011 lock cmpxchg [rdx], ecx
2012 mov rdx, [pu16Old]
2013 mov [rdx], eax
2014# else
2015 lock cmpxchg [edx], ecx
2016 mov edx, [pu16Old]
2017 mov [edx], eax
2018# endif
2019 setz al
2020 movzx eax, al
2021 mov [u16Ret], eax
2022 }
2023 return !!u16Ret;
2024# endif
2025
2026# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
2027 /* M1 bench: match: casalh= 6577 vs dmb+cash= 1608 vs non-lse=5078 (ps/call)
2028 mismatch: casalh=18791 vs dmb+cash=19721 vs non-lse=2543 (ps/call) */
2029# if defined(RTASM_ARM64_USE_FEAT_LSE)
2030 union { uint32_t u; bool f; } fXchg;
2031 uint32_t u32Actual;
2032 __asm__ __volatile__("Lstart_ASMAtomicCmpXchgExU16_%=:\n\t"
2033# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
2034 "casalh %w[uOldActual], %w[uNew], %[pMem]\n\t"
2035# else
2036 RTASM_ARM_DMB_SY
2037 "cash %w[uOldActual], %w[uNew], %[pMem]\n\t"
2038# endif
2039 "cmp %w[uOldActual], %w[uOldOrg]\n\t"
2040 "cset %w[fXchg], eq\n\t"
2041 : [pMem] "+Q" (*pu16)
2042 , [uOldActual] "=&r" (u32Actual)
2043 , [fXchg] "=&r" (fXchg.u)
2044 : [uNew] "r" ((uint32_t)u16New)
2045 , [uOldOrg] "r" ((uint32_t)u16Old)
2046 , "[uOldActual]" ((uint32_t)u16Old)
2047 : "cc");
2048 *pu16Old = (uint16_t)u32Actual;
2049# else
2050 union { uint16_t u; bool f; } fXchg;
2051 uint16_t u16ActualOld;
2052 uint16_t rcSpill;
2053 __asm__ __volatile__("Ltry_again_ASMAtomicCmpXchgExU16_%=:\n\t"
2054 RTASM_ARM_DMB_SY
2055# if defined(RT_ARCH_ARM64)
2056 "ldaxrh %w[uOld], %[pMem]\n\t"
2057 "cmp %w[uOld], %w[uCmp]\n\t"
2058 "bne 1f\n\t" /* stop here if not equal */
2059 "stlxrh %w[rc], %w[uNew], %[pMem]\n\t"
2060 "cbnz %w[rc], Ltry_again_ASMAtomicCmpXchgExU16_%=\n\t"
2061 "mov %w[fXchg], #1\n\t"
2062 "1:\n\t"
2063 "clrex\n\t"
2064# else
2065 "ldrexh %[uOld], %[pMem]\n\t"
2066 "teq %[uOld], %[uCmp]\n\t"
2067 "strexheq %[rc], %[uNew], %[pMem]\n\t"
2068 "bne 1f\n\t" /* stop here if not equal */
2069 "cmp %[rc], #0\n\t"
2070 "bne Ltry_again_ASMAtomicCmpXchgExU16_%=\n\t"
2071 "mov %[fXchg], #1\n\t"
2072 "1:\n\t"
2073 /** @todo clrexne on armv7? */
2074# endif
2075 : [pMem] "+Q" (*pu16)
2076 , [uOld] "=&r" (u16ActualOld)
2077 , [rc] "=&r" (rcSpill)
2078 , [fXchg] "=&r" (fXchg.u)
2079 : [uCmp] "r" (u16Old)
2080 , [uNew] "r" (u16New)
2081 , "[fXchg]" (0)
2082 RTASM_ARM_DMB_SY_COMMA_IN_REG
2083 : "cc");
2084 *pu16Old = u16ActualOld;
2085# endif
2086 return fXchg.f;
2087
2088# else
2089# error "Port me"
2090# endif
2091}
2092#endif
2093
2094
2095/**
2096 * Atomically Compare and Exchange a signed 16-bit value, additionally
2097 * passes back old value, ordered.
2098 *
2099 * @returns true if xchg was done.
2100 * @returns false if xchg wasn't done.
2101 *
2102 * @param pi16 Pointer to the value to update.
2103 * @param i16New The new value to assigned to *pi16.
2104 * @param i16Old The old value to *pi16 compare with.
2105 * @param pi16Old Pointer store the old value at.
2106 *
2107 * @remarks x86: Requires a 486 or later.
2108 */
2109DECLINLINE(bool) ASMAtomicCmpXchgExS16(volatile int16_t RT_FAR *pi16, const int16_t i16New, const int16_t i16Old, int16_t RT_FAR *pi16Old) RT_NOTHROW_DEF
2110{
2111 return ASMAtomicCmpXchgExU16((volatile uint16_t RT_FAR *)pi16, (uint16_t)i16New, (uint16_t)i16Old, (uint16_t RT_FAR *)pi16Old);
2112}
2113
2114
2115/**
2116 * Atomically Compare and Exchange an unsigned 32-bit value, additionally
2117 * passes back old value, ordered.
2118 *
2119 * @returns true if xchg was done.
2120 * @returns false if xchg wasn't done.
2121 *
2122 * @param pu32 Pointer to the value to update.
2123 * @param u32New The new value to assigned to *pu32.
2124 * @param u32Old The old value to *pu32 compare with.
2125 * @param pu32Old Pointer store the old value at.
2126 *
2127 * @remarks x86: Requires a 486 or later.
2128 */
/*
 * ASMAtomicCmpXchgExU32 is either declared as an external function or defined
 * inline below, depending on the RT_INLINE_ASM_* configuration.  The inline
 * body has one implementation per architecture / compiler flavour; each of
 * them stores the value found in *pu32 into *pu32Old and returns whether the
 * exchange was done.
 */
2129#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
2130RT_ASM_DECL_PRAGMA_WATCOM(bool) ASMAtomicCmpXchgExU32(volatile uint32_t RT_FAR *pu32, const uint32_t u32New, const uint32_t u32Old, uint32_t RT_FAR *pu32Old) RT_NOTHROW_PROTO;
2131#else
2132DECLINLINE(bool) ASMAtomicCmpXchgExU32(volatile uint32_t RT_FAR *pu32, const uint32_t u32New, const uint32_t u32Old, uint32_t RT_FAR *pu32Old) RT_NOTHROW_DEF
2133{
2134# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
2135# if RT_INLINE_ASM_GNU_STYLE
/* GNU-style asm: u32Old goes in via the "a" register and the same register
   is written to *pu32Old afterwards; setz captures the outcome in u8Ret. */
2136 uint8_t u8Ret;
2137 __asm__ __volatile__("lock; cmpxchgl %3, %0\n\t"
2138 "setz %1\n\t"
2139 : "=m" (*pu32)
2140 , "=qm" (u8Ret)
2141 , "=a" (*pu32Old)
2142 : "r" (u32New)
2143 , "a" (u32Old)
2144 , "m" (*pu32)
2145 : "cc");
2146 return (bool)u8Ret;
2147
2148# elif RT_INLINE_ASM_USES_INTRIN
/* Compiler intrinsic: the returned value is stored in *pu32Old and compared
   with u32Old to produce the boolean result. */
2149 return (*pu32Old = _InterlockedCompareExchange((long RT_FAR *)pu32, u32New, u32Old)) == u32Old;
2150
2151# else
/* __asm block flavour: eax = comparand / value found, ecx = new value; eax is
   stored through pu32Old before setz turns the zero flag into the result. */
2152 uint32_t u32Ret;
2153 __asm
2154 {
2155# ifdef RT_ARCH_AMD64
2156 mov rdx, [pu32]
2157# else
2158 mov edx, [pu32]
2159# endif
2160 mov eax, [u32Old]
2161 mov ecx, [u32New]
2162# ifdef RT_ARCH_AMD64
2163 lock cmpxchg [rdx], ecx
2164 mov rdx, [pu32Old]
2165 mov [rdx], eax
2166# else
2167 lock cmpxchg [edx], ecx
2168 mov edx, [pu32Old]
2169 mov [edx], eax
2170# endif
2171 setz al
2172 movzx eax, al
2173 mov [u32Ret], eax
2174 }
2175 return !!u32Ret;
2176# endif
2177
2178# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
/* The result is produced in fXchg.u by the asm and read back as fXchg.f. */
2179 union { uint32_t u; bool f; } fXchg;
2180 /* M1 bench: match: casal= 6590 vs dmb+cas= 1564 vs non-lse=5033 (ps/call)
2181 mismatch: casal=18790 vs dmb+cas=19711 vs non-lse=2503 (ps/call) */
2182# if defined(RTASM_ARM64_USE_FEAT_LSE)
/* cas/casal variant: uOldActual is preloaded with u32Old (matching input
   constraint) and is bound directly to *pu32Old as output; cmp/cset then
   compare it with the original comparand. */
2183 __asm__ __volatile__("Lstart_ASMAtomicCmpXchgExU32_%=:\n\t"
2184# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
2185 "casal %w[uOldActual], %w[uNew], %[pMem]\n\t"
2186# else
2187 RTASM_ARM_DMB_SY
2188 "cas %w[uOldActual], %w[uNew], %[pMem]\n\t"
2189# endif
2190 "cmp %w[uOldActual], %w[uOldOrg]\n\t"
2191 "cset %w[fXchg], eq\n\t"
2192 : [pMem] "+Q" (*pu32)
2193 , [uOldActual] "=&r" (*pu32Old)
2194 , [fXchg] "=&r" (fXchg.u)
2195 : [uNew] "r" (u32New)
2196 , [uOldOrg] "r" (u32Old)
2197 , "[uOldActual]" (u32Old)
2198 : "cc");
2199# else
/* Exclusive load/store loop: fXchg is preset to 0 via the matching input
   constraint and only set to 1 after the exclusive store succeeded; a failed
   store (rc != 0) restarts the loop, a mismatch branches to label 1. */
2200 uint32_t u32ActualOld;
2201 uint32_t rcSpill;
2202 __asm__ __volatile__("Ltry_again_ASMAtomicCmpXchgExU32_%=:\n\t"
2203 RTASM_ARM_DMB_SY
2204# if defined(RT_ARCH_ARM64)
2205 "ldaxr %w[uOld], %[pMem]\n\t"
2206 "cmp %w[uOld], %w[uCmp]\n\t"
2207 "bne 1f\n\t" /* stop here if not equal */
2208 "stlxr %w[rc], %w[uNew], %[pMem]\n\t"
2209 "cbnz %w[rc], Ltry_again_ASMAtomicCmpXchgExU32_%=\n\t"
2210 "mov %w[fXchg], #1\n\t"
2211 "1:\n\t"
2212 "clrex\n\t"
2213# else
2214 "ldrex %[uOld], %[pMem]\n\t"
2215 "teq %[uOld], %[uCmp]\n\t"
2216 "strexeq %[rc], %[uNew], %[pMem]\n\t"
2217 "bne 1f\n\t" /* stop here if not equal */
2218 "cmp %[rc], #0\n\t"
2219 "bne Ltry_again_ASMAtomicCmpXchgExU32_%=\n\t"
2220 "mov %[fXchg], #1\n\t"
2221 "1:\n\t"
2222 /** @todo clrexne on armv7? */
2223# endif
2224 : [pMem] "+Q" (*pu32)
2225 , [uOld] "=&r" (u32ActualOld)
2226 , [rc] "=&r" (rcSpill)
2227 , [fXchg] "=&r" (fXchg.u)
2228 : [uCmp] "r" (u32Old)
2229 , [uNew] "r" (u32New)
2230 , "[fXchg]" (0)
2231 RTASM_ARM_DMB_SY_COMMA_IN_REG
2232 : "cc");
2233 *pu32Old = u32ActualOld;
2234# endif
2235 return fXchg.f;
2236
2237# else
2238# error "Port me"
2239# endif
2240}
2241#endif
2242
2243
2244/**
2245 * Atomically Compare and Exchange a signed 32-bit value, additionally
2246 * passes back old value, ordered.
2247 *
2248 * @returns true if xchg was done.
2249 * @returns false if xchg wasn't done.
2250 *
2251 * @param pi32 Pointer to the value to update.
2252 * @param i32New The new value to assigned to *pi32.
2253 * @param i32Old The old value to *pi32 compare with.
2254 * @param pi32Old Pointer store the old value at.
2255 *
2256 * @remarks x86: Requires a 486 or later.
2257 */
2258DECLINLINE(bool) ASMAtomicCmpXchgExS32(volatile int32_t RT_FAR *pi32, const int32_t i32New, const int32_t i32Old, int32_t RT_FAR *pi32Old) RT_NOTHROW_DEF
2259{
2260 return ASMAtomicCmpXchgExU32((volatile uint32_t RT_FAR *)pi32, (uint32_t)i32New, (uint32_t)i32Old, (uint32_t RT_FAR *)pi32Old);
2261}
2262
2263
2264/**
2265 * Atomically Compare and exchange an unsigned 64-bit value, additionally
2266 * passing back old value, ordered.
2267 *
2268 * @returns true if xchg was done.
2269 * @returns false if xchg wasn't done.
2270 *
2271 * @param pu64 Pointer to the 64-bit variable to update.
2272 * @param u64New The 64-bit value to assign to *pu64.
2273 * @param u64Old The value to compare with.
2274 * @param pu64Old Pointer store the old value at.
2275 *
2276 * @remarks x86: Requires a Pentium or later.
2277 */
2278#if (RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN) \
2279 || RT_INLINE_DONT_MIX_CMPXCHG8B_AND_PIC
2280RT_ASM_DECL_PRAGMA_WATCOM(bool) ASMAtomicCmpXchgExU64(volatile uint64_t RT_FAR *pu64, const uint64_t u64New, const uint64_t u64Old, uint64_t RT_FAR *pu64Old) RT_NOTHROW_PROTO;
2281#else
2282DECLINLINE(bool) ASMAtomicCmpXchgExU64(volatile uint64_t RT_FAR *pu64, const uint64_t u64New, const uint64_t u64Old, uint64_t RT_FAR *pu64Old) RT_NOTHROW_DEF
2283{
2284# if RT_INLINE_ASM_USES_INTRIN
2285 return (*pu64Old =_InterlockedCompareExchange64((__int64 RT_FAR *)pu64, u64New, u64Old)) == u64Old;
2286
2287# elif defined(RT_ARCH_AMD64)
2288# if RT_INLINE_ASM_GNU_STYLE
2289 uint8_t u8Ret;
2290 __asm__ __volatile__("lock; cmpxchgq %3, %0\n\t"
2291 "setz %1\n\t"
2292 : "=m" (*pu64)
2293 , "=qm" (u8Ret)
2294 , "=a" (*pu64Old)
2295 : "r" (u64New)
2296 , "a" (u64Old)
2297 , "m" (*pu64)
2298 : "cc");
2299 return (bool)u8Ret;
2300# else
2301 bool fRet;
2302 __asm
2303 {
2304 mov rdx, [pu32]
2305 mov rax, [u64Old]
2306 mov rcx, [u64New]
2307 lock cmpxchg [rdx], rcx
2308 mov rdx, [pu64Old]
2309 mov [rdx], rax
2310 setz al
2311 mov [fRet], al
2312 }
2313 return fRet;
2314# endif
2315
2316# elif defined(RT_ARCH_X86)
2317# if RT_INLINE_ASM_GNU_STYLE
2318 uint64_t u64Ret;
2319# if defined(PIC) || defined(__PIC__)
2320 /* Note #1: This code uses a memory clobber description, because the clean
2321 solution with an output value for *pu64 makes gcc run out of
2322 registers. This will cause suboptimal code, and anyone with a
2323 better solution is welcome to improve this.
2324
2325 Note #2: We must prevent gcc from encoding the memory access, as it
2326 may go via the GOT if we're working on a global variable (like
2327 in the testcase). Thus we request a register (%3) and
2328 dereference it ourselves. */
2329 __asm__ __volatile__("xchgl %%ebx, %1\n\t"
2330 "lock; cmpxchg8b (%3)\n\t"
2331 "xchgl %%ebx, %1\n\t"
2332 : "=A" (u64Ret)
2333 : "DS" ((uint32_t)u64New)
2334 , "c" ((uint32_t)(u64New >> 32))
2335 , "r" (pu64) /* Do not use "m" here*/
2336 , "0" (u64Old)
2337 : "memory"
2338 , "cc" );
2339# else /* !PIC */
2340 __asm__ __volatile__("lock; cmpxchg8b %4\n\t"
2341 : "=A" (u64Ret)
2342 , "=m" (*pu64)
2343 : "b" ((uint32_t)u64New)
2344 , "c" ((uint32_t)(u64New >> 32))
2345 , "m" (*pu64)
2346 , "0" (u64Old)
2347 : "cc");
2348# endif
2349 *pu64Old = u64Ret;
2350 return u64Ret == u64Old;
2351# else
2352 uint32_t u32Ret;
2353 __asm
2354 {
2355 mov ebx, dword ptr [u64New]
2356 mov ecx, dword ptr [u64New + 4]
2357 mov edi, [pu64]
2358 mov eax, dword ptr [u64Old]
2359 mov edx, dword ptr [u64Old + 4]
2360 lock cmpxchg8b [edi]
2361 mov ebx, [pu64Old]
2362 mov [ebx], eax
2363 setz al
2364 movzx eax, al
2365 add ebx, 4
2366 mov [ebx], edx
2367 mov dword ptr [u32Ret], eax
2368 }
2369 return !!u32Ret;
2370# endif
2371
2372# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
2373 union { uint32_t u; bool f; } fXchg;
2374 /* M1 bench: match: casal= 6606 vs dmb+cas= 1565 vs non-lse=5006 (ps/call)
2375 mismatch: casal=18786 vs dmb+cas=19718 vs non-lse=2503 (ps/call) */
2376# if defined(RTASM_ARM64_USE_FEAT_LSE)
2377 __asm__ __volatile__("Lstart_ASMAtomicCmpXchgExU32_%=:\n\t"
2378# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
2379 "casal %[uOldActual], %[uNew], %[pMem]\n\t"
2380# else
2381 RTASM_ARM_DMB_SY
2382 "cas %[uOldActual], %[uNew], %[pMem]\n\t"
2383# endif
2384 "cmp %[uOldActual], %[uOldOrg]\n\t"
2385 "cset %w[fXchg], eq\n\t"
2386 : [pMem] "+Q" (*pu64)
2387 , [uOldActual] "=&r" (*pu64Old)
2388 , [fXchg] "=&r" (fXchg.u)
2389 : [uNew] "r" (u64New)
2390 , [uOldOrg] "r" (u64Old)
2391 , "[uOldActual]" (u64Old)
2392 : "cc");
2393# else
2394 uint64_t u64ActualOld;
2395 uint32_t rcSpill;
2396 __asm__ __volatile__("Ltry_again_ASMAtomicCmpXchgU64_%=:\n\t"
2397 RTASM_ARM_DMB_SY
2398# if defined(RT_ARCH_ARM64)
2399 "ldaxr %[uOld], %[pMem]\n\t"
2400 "cmp %[uOld], %[uCmp]\n\t"
2401 "bne 1f\n\t" /* stop here if not equal */
2402 "stlxr %w[rc], %[uNew], %[pMem]\n\t"
2403 "cbnz %w[rc], Ltry_again_ASMAtomicCmpXchgU64_%=\n\t"
2404 "mov %w[fXchg], #1\n\t"
2405 "1:\n\t"
2406 "clrex\n\t"
2407# else
2408 "ldrexd %[uOld], %H[uOld], %[pMem]\n\t"
2409 "teq %[uOld], %[uCmp]\n\t"
2410 "teqeq %H[uOld], %H[uCmp]\n\t"
2411 "strexdeq %[rc], %[uNew], %H[uNew], %[pMem]\n\t"
2412 "bne 1f\n\t" /* stop here if not equal */
2413 "cmp %[rc], #0\n\t"
2414 "bne Ltry_again_ASMAtomicCmpXchgU64_%=\n\t"
2415 "mov %[fXchg], #1\n\t"
2416 "1:\n\t"
2417 /** @todo clrexne on armv7? */
2418# endif
2419 : [pMem] "+Q" (*pu64)
2420 , [uOld] "=&r" (u64ActualOld)
2421 , [rc] "=&r" (rcSpill)
2422 , [fXchg] "=&r" (fXchg.u)
2423 : [uCmp] "r" (u64Old)
2424 , [uNew] "r" (u64New)
2425 , "[fXchg]" (0)
2426 RTASM_ARM_DMB_SY_COMMA_IN_REG
2427 : "cc");
2428 *pu64Old = u64ActualOld;
2429# endif
2430 return fXchg.f;
2431
2432# else
2433# error "Port me"
2434# endif
2435}
2436#endif
2437
2438
2439/**
2440 * Atomically Compare and exchange a signed 64-bit value, additionally
2441 * passing back old value, ordered.
2442 *
2443 * @returns true if xchg was done.
2444 * @returns false if xchg wasn't done.
2445 *
2446 * @param pi64 Pointer to the 64-bit variable to update.
2447 * @param i64 The 64-bit value to assign to *pu64.
2448 * @param i64Old The value to compare with.
2449 * @param pi64Old Pointer store the old value at.
2450 *
2451 * @remarks x86: Requires a Pentium or later.
2452 */
2453DECLINLINE(bool) ASMAtomicCmpXchgExS64(volatile int64_t RT_FAR *pi64, const int64_t i64, const int64_t i64Old, int64_t RT_FAR *pi64Old) RT_NOTHROW_DEF
2454{
2455 return ASMAtomicCmpXchgExU64((volatile uint64_t RT_FAR *)pi64, (uint64_t)i64, (uint64_t)i64Old, (uint64_t RT_FAR *)pi64Old);
2456}
2457
2458#if defined(RT_ARCH_AMD64) || defined(RT_ARCH_ARM64) || defined(DOXYGEN_RUNNING)
2459
/** @def RTASM_HAVE_CMP_XCHG_U128
 * Indicates that we've got ASMAtomicCmpXchgU128(), ASMAtomicCmpXchgU128v2()
 * and ASMAtomicCmpXchgU128U() available. */
2463# define RTASM_HAVE_CMP_XCHG_U128 1
2464
2465
2466/**
2467 * Atomically compare and exchange an unsigned 128-bit value, ordered.
2468 *
2469 * @returns true if exchange was done.
2470 * @returns false if exchange wasn't done.
2471 *
2472 * @param pu128 Pointer to the 128-bit variable to update.
2473 * @param u64NewHi The high 64 bits of the value to assign to *pu128.
2474 * @param u64NewLo The low 64 bits of the value to assign to *pu128.
2475 * @param u64OldHi The high 64-bit of the value to compare with.
2476 * @param u64OldLo The low 64-bit of the value to compare with.
2477 * @param pu128Old Where to return the old value.
2478 *
2479 * @remarks AMD64: Not present in the earliest CPUs, so check CPUID.
2480 */
/*
 * ASMAtomicCmpXchgU128v2 is either declared as an external assembly function
 * or defined inline below, depending on the RT_INLINE_ASM_* configuration.
 * The inline body picks a compiler intrinsic, a compiler builtin (ARM64 with
 * clang/GCC) or GNU-style inline assembly (AMD64).
 */
2481# if (RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN)
2482DECLASM(bool) ASMAtomicCmpXchgU128v2(volatile uint128_t *pu128, const uint64_t u64NewHi, const uint64_t u64NewLo,
2483 const uint64_t u64OldHi, const uint64_t u64OldLo, uint128_t *pu128Old) RT_NOTHROW_PROTO;
2484# else
2485DECLINLINE(bool) ASMAtomicCmpXchgU128v2(volatile uint128_t *pu128, const uint64_t u64NewHi, const uint64_t u64NewLo,
2486 const uint64_t u64OldHi, const uint64_t u64OldLo, uint128_t *pu128Old) RT_NOTHROW_DEF
2487{
2488# if RT_INLINE_ASM_USES_INTRIN
/* *pu128Old is loaded with the comparand first because the intrinsic takes
   it by pointer (last argument); the compile-time assertion checks that Lo
   is the first member, as the pointer passed is that of the Lo member. */
2489 pu128Old->Hi = u64OldHi;
2490 pu128Old->Lo = u64OldLo;
2491 AssertCompileMemberOffset(uint128_t, Lo, 0);
2492 return _InterlockedCompareExchange128((__int64 volatile *)pu128, u64NewHi, u64NewLo, (__int64 *)&pu128Old->Lo) != 0;
2493
2494# elif (defined(__clang_major__) || defined(__GNUC__)) && defined(RT_ARCH_ARM64)
/* Compiler builtin: the halves are combined into 128-bit values, the value
   returned by the builtin is stored in *pu128Old and compared with the
   comparand to produce the result. */
2495 uint128_t const uCmp = ((uint128_t)u64OldHi << 64) | u64OldLo;
2496 uint128_t const uOld = __sync_val_compare_and_swap(pu128, uCmp, ((uint128_t)u64NewHi << 64) | u64NewLo);
2497 *pu128Old = uOld;
2498 return uCmp == uOld;
2499
2500# elif defined(RT_ARCH_AMD64)
2501# if RT_INLINE_ASM_GNU_STYLE
/* cmpxchg16b: the comparand is passed in the "d":"a" register pair and the
   new value in "c":"b"; the "d":"a" pair is read back as the old value and
   setz captures the outcome in bRet. */
2502 uint8_t bRet;
2503 uint64_t u64RetHi, u64RetLo;
2504 __asm__ __volatile__("lock; cmpxchg16b %3\n\t"
2505 "setz %b0\n\t"
2506 : "=r" (bRet)
2507 , "=a" (u64RetLo)
2508 , "=d" (u64RetHi)
2509 , "+m" (*pu128)
2510 : "a" (u64OldLo)
2511 , "d" (u64OldHi)
2512 , "b" (u64NewLo)
2513 , "c" (u64NewHi)
2514 : "cc");
2515 *pu128Old = ((uint128_t)u64RetHi << 64) | u64RetLo;
2516 return (bool)bRet;
2517# else
2518# error "Port me"
2519# endif
2520# else
2521# error "Port me"
2522# endif
2523}
2524# endif
2525
2526
2527/**
2528 * Atomically compare and exchange an unsigned 128-bit value, ordered.
2529 *
2530 * @returns true if exchange was done.
2531 * @returns false if exchange wasn't done.
2532 *
2533 * @param pu128 Pointer to the 128-bit variable to update.
2534 * @param u128New The 128-bit value to assign to *pu128.
2535 * @param u128Old The value to compare with.
2536 * @param pu128Old Where to return the old value.
2537 *
2538 * @remarks AMD64: Not present in the earliest CPUs, so check CPUID.
2539 */
2540DECLINLINE(bool) ASMAtomicCmpXchgU128(volatile uint128_t *pu128, const uint128_t u128New,
2541 const uint128_t u128Old, uint128_t *pu128Old) RT_NOTHROW_DEF
2542{
2543# ifdef RT_COMPILER_WITH_128BIT_INT_TYPES
2544# if (defined(__clang_major__) || defined(__GNUC__)) && defined(RT_ARCH_ARM64)
2545 uint128_t const uSwapped = __sync_val_compare_and_swap(pu128, u128Old, u128New);
2546 *pu128Old = uSwapped;
2547 return uSwapped == u128Old;
2548# else
2549 return ASMAtomicCmpXchgU128v2(pu128, (uint64_t)(u128New >> 64), (uint64_t)u128New,
2550 (uint64_t)(u128Old >> 64), (uint64_t)u128Old, pu128Old);
2551# endif
2552# else
2553 return ASMAtomicCmpXchgU128v2(pu128, u128New.Hi, u128New.Lo, u128Old.Hi, u128Old.Lo, pu128Old);
2554# endif
2555}
2556
2557
2558/**
2559 * RTUINT128U wrapper for ASMAtomicCmpXchgU128.
2560 */
2561DECLINLINE(bool) ASMAtomicCmpXchgU128U(volatile RTUINT128U *pu128, const RTUINT128U u128New,
2562 const RTUINT128U u128Old, PRTUINT128U pu128Old) RT_NOTHROW_DEF
2563{
2564# if (defined(__clang_major__) || defined(__GNUC__)) && defined(RT_ARCH_ARM64)
2565 return ASMAtomicCmpXchgU128(&pu128->u, u128New.u, u128Old.u, &pu128Old->u);
2566# else
2567 return ASMAtomicCmpXchgU128v2(&pu128->u, u128New.s.Hi, u128New.s.Lo, u128Old.s.Hi, u128Old.s.Lo, &pu128Old->u);
2568# endif
2569}
2570
2571#endif /* RT_ARCH_AMD64 || RT_ARCH_ARM64 */
2572
2573
2574
2575/** @def ASMAtomicCmpXchgExHandle
2576 * Atomically Compare and Exchange a typical IPRT handle value, ordered.
2577 *
2578 * @param ph Pointer to the value to update.
2579 * @param hNew The new value to assigned to *pu.
2580 * @param hOld The old value to *pu compare with.
2581 * @param fRc Where to store the result.
2582 * @param phOldVal Pointer to where to store the old value.
2583 *
2584 * @remarks This doesn't currently work for all handles (like RTFILE).
2585 */
2586#if HC_ARCH_BITS == 32 || ARCH_BITS == 16
2587# define ASMAtomicCmpXchgExHandle(ph, hNew, hOld, fRc, phOldVal) \
2588 do { \
2589 AssertCompile(sizeof(*ph) == sizeof(uint32_t)); \
2590 AssertCompile(sizeof(*phOldVal) == sizeof(uint32_t)); \
2591 (fRc) = ASMAtomicCmpXchgExU32((volatile uint32_t RT_FAR *)(ph), (uint32_t)(hNew), (uint32_t)(hOld), (uint32_t RT_FAR *)(phOldVal)); \
2592 } while (0)
2593#elif HC_ARCH_BITS == 64
2594# define ASMAtomicCmpXchgExHandle(ph, hNew, hOld, fRc, phOldVal) \
2595 do { \
2596 AssertCompile(sizeof(*(ph)) == sizeof(uint64_t)); \
2597 AssertCompile(sizeof(*(phOldVal)) == sizeof(uint64_t)); \
2598 (fRc) = ASMAtomicCmpXchgExU64((volatile uint64_t RT_FAR *)(ph), (uint64_t)(hNew), (uint64_t)(hOld), (uint64_t RT_FAR *)(phOldVal)); \
2599 } while (0)
2600#else
2601# error HC_ARCH_BITS
2602#endif
2603
2604
2605/** @def ASMAtomicCmpXchgExSize
2606 * Atomically Compare and Exchange a value which size might differ
2607 * between platforms or compilers. Additionally passes back old value.
2608 *
2609 * @param pu Pointer to the value to update.
2610 * @param uNew The new value to assigned to *pu.
2611 * @param uOld The old value to *pu compare with.
2612 * @param fRc Where to store the result.
2613 * @param puOldVal Pointer to where to store the old value.
2614 *
2615 * @remarks x86: Requires a 486 or later.
2616 */
2617#define ASMAtomicCmpXchgExSize(pu, uNew, uOld, fRc, puOldVal) \
2618 do { \
2619 switch (sizeof(*(pu))) { \
2620 case 4: (fRc) = ASMAtomicCmpXchgExU32((volatile uint32_t RT_FAR *)(void RT_FAR *)(pu), (uint32_t)(uNew), (uint32_t)(uOld), (uint32_t RT_FAR *)(uOldVal)); \
2621 break; \
2622 case 8: (fRc) = ASMAtomicCmpXchgExU64((volatile uint64_t RT_FAR *)(void RT_FAR *)(pu), (uint64_t)(uNew), (uint64_t)(uOld), (uint64_t RT_FAR *)(uOldVal)); \
2623 break; \
2624 default: AssertMsgFailed(("ASMAtomicCmpXchgSize: size %d is not supported\n", sizeof(*(pu)))); \
2625 (fRc) = false; \
2626 (uOldVal) = 0; \
2627 break; \
2628 } \
2629 } while (0)
2630
2631
2632/**
2633 * Atomically Compare and Exchange a pointer value, additionally
2634 * passing back old value, ordered.
2635 *
2636 * @returns true if xchg was done.
2637 * @returns false if xchg wasn't done.
2638 *
2639 * @param ppv Pointer to the value to update.
2640 * @param pvNew The new value to assigned to *ppv.
2641 * @param pvOld The old value to *ppv compare with.
2642 * @param ppvOld Pointer store the old value at.
2643 *
2644 * @remarks x86: Requires a 486 or later.
2645 */
2646DECLINLINE(bool) ASMAtomicCmpXchgExPtrVoid(void RT_FAR * volatile RT_FAR *ppv, const void RT_FAR *pvNew, const void RT_FAR *pvOld,
2647 void RT_FAR * RT_FAR *ppvOld) RT_NOTHROW_DEF
2648{
2649#if ARCH_BITS == 32 || ARCH_BITS == 16
2650 return ASMAtomicCmpXchgExU32((volatile uint32_t RT_FAR *)(void RT_FAR *)ppv, (uint32_t)pvNew, (uint32_t)pvOld, (uint32_t RT_FAR *)ppvOld);
2651#elif ARCH_BITS == 64
2652 return ASMAtomicCmpXchgExU64((volatile uint64_t RT_FAR *)(void RT_FAR *)ppv, (uint64_t)pvNew, (uint64_t)pvOld, (uint64_t RT_FAR *)ppvOld);
2653#else
2654# error "ARCH_BITS is bogus"
2655#endif
2656}
2657
2658
/**
 * Atomic, ordered compare and exchange of a typed pointer value which also
 * hands back the pointer that was found.
 *
 * Wrapper around ASMAtomicCmpXchgExPtrVoid that does the casting.
 *
 * @returns true if xchg was done.
 * @returns false if xchg wasn't done.
 *
 * @param   ppv         Pointer to the pointer variable to update.
 * @param   pvNew       The pointer to store in *ppv on a match.
 * @param   pvOld       The pointer *ppv is expected to hold.
 * @param   ppvOld      Where to store the pointer found in *ppv.
 *
 * @remarks This is relatively type safe on GCC platforms.
 * @remarks x86: Requires a 486 or later.
 */
#ifndef __GNUC__
/* Plain casts, no type checking. */
# define ASMAtomicCmpXchgExPtr(ppv, pvNew, pvOld, ppvOld) \
    ASMAtomicCmpXchgExPtrVoid((void RT_FAR * volatile RT_FAR *)(ppv), (void RT_FAR *)(pvNew), (void RT_FAR *)(pvOld), (void RT_FAR * RT_FAR *)(ppvOld))
#else
/* Each argument is first assigned to a local typed after *(ppv), so passing
   an incompatible pointer gets diagnosed by the compiler before the casts. */
# define ASMAtomicCmpXchgExPtr(ppv, pvNew, pvOld, ppvOld) \
    __extension__ \
    ({\
        __typeof__(*(ppv)) volatile * const ppvTypeChecked    = (ppv); \
        __typeof__(*(ppv))            const pvNewTypeChecked  = (pvNew); \
        __typeof__(*(ppv))            const pvOldTypeChecked  = (pvOld); \
        __typeof__(*(ppv)) *          const ppvOldTypeChecked = (ppvOld); \
        ASMAtomicCmpXchgExPtrVoid((void * volatile *)ppvTypeChecked, \
                                  (void *)pvNewTypeChecked, (void *)pvOldTypeChecked, \
                                  (void **)ppvOldTypeChecked); \
     })
#endif
2691
2692
2693/**
2694 * Virtualization unfriendly serializing instruction, always exits.
2695 */
/*
 * ASMSerializeInstructionCpuId: external prototype on non-x86 targets or
 * when inline assembly is external and intrinsics are not used, inline
 * otherwise.  The inline body executes CPUID with EAX set to zero and
 * ignores what it returns.
 */
2696#if (RT_INLINE_ASM_EXTERNAL && !RT_INLINE_ASM_USES_INTRIN) || (!defined(RT_ARCH_AMD64) && !defined(RT_ARCH_X86))
2697RT_ASM_DECL_PRAGMA_WATCOM(void) ASMSerializeInstructionCpuId(void) RT_NOTHROW_PROTO;
2698#else
2699DECLINLINE(void) ASMSerializeInstructionCpuId(void) RT_NOTHROW_DEF
2700{
2701# if RT_INLINE_ASM_GNU_STYLE
2702 RTCCUINTREG xAX = 0;
2703# ifdef RT_ARCH_AMD64
/* The registers written by CPUID are listed as clobbers; "memory" makes the
   statement a compiler barrier as well. */
2704 __asm__ __volatile__ ("cpuid"
2705 : "=a" (xAX)
2706 : "0" (xAX)
2707 : "rbx", "rcx", "rdx", "memory");
2708# elif (defined(PIC) || defined(__PIC__)) && defined(__i386__)
/* 32-bit PIC variant: ebx is saved and restored by hand instead of being
   listed as a clobber. */
2709 __asm__ __volatile__ ("push %%ebx\n\t"
2710 "cpuid\n\t"
2711 "pop %%ebx\n\t"
2712 : "=a" (xAX)
2713 : "0" (xAX)
2714 : "ecx", "edx", "memory");
2715# else
2716 __asm__ __volatile__ ("cpuid"
2717 : "=a" (xAX)
2718 : "0" (xAX)
2719 : "ebx", "ecx", "edx", "memory");
2720# endif
2721
2722# elif RT_INLINE_ASM_USES_INTRIN
/* Intrinsic variant: compiler barrier followed by CPUID leaf 0. */
2723 int aInfo[4];
2724 _ReadWriteBarrier();
2725 __cpuid(aInfo, 0);
2726
2727# else
2728 __asm
2729 {
2730 push ebx
2731 xor eax, eax
2732 cpuid
2733 pop ebx
2734 }
2735# endif
2736}
2737#endif
2738
2739/**
2740 * Virtualization friendly serializing instruction, though more expensive.
2741 */
/*
 * ASMSerializeInstructionIRet: external prototype when inline assembly is
 * external or the target is not x86/AMD64, inline otherwise.  The inline
 * body pushes a return frame that points at the label following the IRET
 * instruction and then executes IRET.
 */
2742#if RT_INLINE_ASM_EXTERNAL || (!defined(RT_ARCH_AMD64) && !defined(RT_ARCH_X86))
2743RT_ASM_DECL_PRAGMA_WATCOM(void) ASMSerializeInstructionIRet(void) RT_NOTHROW_PROTO;
2744#else
2745DECLINLINE(void) ASMSerializeInstructionIRet(void) RT_NOTHROW_DEF
2746{
2747# if RT_INLINE_ASM_GNU_STYLE
2748# ifdef RT_ARCH_AMD64
/* 64-bit frame, pushed in this order: ss, the original rsp (saved in r10
   before the stack pointer is lowered by 128 bytes), rflags, cs and the
   address of label 1. */
2749 __asm__ __volatile__ ("movq %%rsp,%%r10\n\t"
2750 "subq $128, %%rsp\n\t" /*redzone*/
2751 "mov %%ss, %%eax\n\t"
2752 "pushq %%rax\n\t"
2753 "pushq %%r10\n\t"
2754 "pushfq\n\t"
2755 "movl %%cs, %%eax\n\t"
2756 "pushq %%rax\n\t"
2757 "leaq 1f(%%rip), %%rax\n\t"
2758 "pushq %%rax\n\t"
2759 "iretq\n\t"
2760 "1:\n\t"
2761 ::: "rax", "r10", "memory", "cc");
2762# else
/* 32-bit frame: eflags, cs and the address of label 1. */
2763 __asm__ __volatile__ ("pushfl\n\t"
2764 "pushl %%cs\n\t"
2765 "pushl $1f\n\t"
2766 "iretl\n\t"
2767 "1:\n\t"
2768 ::: "memory");
2769# endif
2770
2771# else
2772 __asm
2773 {
2774 pushfd
2775 push cs
2776 push la_ret
2777 iretd
2778 la_ret:
2779 }
2780# endif
2781}
2782#endif
2783
2784/**
2785 * Virtualization friendlier serializing instruction, may still cause exits.
2786 */
/*
 * ASMSerializeInstructionRdTscp: external prototype or inline definition
 * depending on the configuration.  The inline body executes RDTSCP and
 * discards what it returns.
 */
2787#if (RT_INLINE_ASM_EXTERNAL && RT_INLINE_ASM_USES_INTRIN < RT_MSC_VER_VS2008) || (!defined(RT_ARCH_AMD64) && !defined(RT_ARCH_X86))
2788RT_ASM_DECL_PRAGMA_WATCOM(void) ASMSerializeInstructionRdTscp(void) RT_NOTHROW_PROTO;
2789#else
2790DECLINLINE(void) ASMSerializeInstructionRdTscp(void) RT_NOTHROW_DEF
2791{
2792# if RT_INLINE_ASM_GNU_STYLE
2793 /* rdtscp is not supported by ancient linux build VM of course :-( */
/* Hence the instruction is emitted as raw bytes below; the commented-out
   statements show the mnemonic form. */
2794# ifdef RT_ARCH_AMD64
2795 /*__asm__ __volatile__("rdtscp\n\t" ::: "rax", "rdx", "rcx"); */
2796 __asm__ __volatile__(".byte 0x0f,0x01,0xf9\n\t" ::: "rax", "rdx", "rcx", "memory");
2797# else
2798 /*__asm__ __volatile__("rdtscp\n\t" ::: "eax", "edx", "ecx"); */
2799 __asm__ __volatile__(".byte 0x0f,0x01,0xf9\n\t" ::: "eax", "edx", "ecx", "memory");
2800# endif
2801# else
2802# if RT_INLINE_ASM_USES_INTRIN >= RT_MSC_VER_VS2008
/* Intrinsic variant: compiler barrier, then __rdtscp with both results ignored. */
2803 uint32_t uIgnore;
2804 _ReadWriteBarrier();
2805 (void)__rdtscp(&uIgnore);
2806 (void)uIgnore;
2807# else
2808 __asm
2809 {
2810 rdtscp
2811 }
2812# endif
2813# endif
2814}
2815#endif
2816
2817
2818/**
2819 * Serialize Instruction (both data store and instruction flush).
2820 */
/*
 * Selection of the ASMSerializeInstruction implementation; the first
 * matching condition wins:
 *   - 16-bit x86 or IN_GUEST:  alias for ASMSerializeInstructionIRet().
 *   - other x86 / AMD64:       alias for ASMSerializeInstructionCpuId().
 *   - SPARC64:                 external function, only declared here.
 *   - ARM64 / ARM32:           inline function issuing RTASM_ARM_DSB_SY.
 */
2821#if (defined(RT_ARCH_X86) && ARCH_BITS == 16) || defined(IN_GUEST)
2822# define ASMSerializeInstruction() ASMSerializeInstructionIRet()
2823#elif defined(RT_ARCH_X86) || defined(RT_ARCH_AMD64)
2824# define ASMSerializeInstruction() ASMSerializeInstructionCpuId()
2825#elif defined(RT_ARCH_SPARC64)
2826RTDECL(void) ASMSerializeInstruction(void) RT_NOTHROW_PROTO;
2827#elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
2828DECLINLINE(void) ASMSerializeInstruction(void) RT_NOTHROW_DEF
2829{
2830 __asm__ __volatile__ (RTASM_ARM_DSB_SY :: RTASM_ARM_DSB_SY_IN_REG :);
2831}
2832#else
2833# error "Port me"
2834#endif
2835
2836
2837/**
2838 * Memory fence, waits for any pending writes and reads to complete.
2839 * @note No implicit compiler barrier (which is probably stupid).
2840 */
/*
 * Full memory fence.  AMD64, and x86 without RT_WITH_OLD_CPU_SUPPORT, use a
 * dedicated fence instruction; ARM uses RTASM_ARM_DMB_SY; everything else
 * falls back on an atomic exchange with a local dummy variable.
 */
2841DECLINLINE(void) ASMMemoryFence(void) RT_NOTHROW_DEF
2842{
2843#if defined(RT_ARCH_AMD64) || (defined(RT_ARCH_X86) && !defined(RT_WITH_OLD_CPU_SUPPORT))
2844# if RT_INLINE_ASM_GNU_STYLE
/* Raw opcode bytes; the intrinsic branch below uses _mm_mfence(), i.e. MFENCE. */
2845 __asm__ __volatile__ (".byte 0x0f,0xae,0xf0\n\t");
2846# elif RT_INLINE_ASM_USES_INTRIN
2847 _mm_mfence();
2848# else
2849 __asm
2850 {
2851 _emit 0x0f
2852 _emit 0xae
2853 _emit 0xf0
2854 }
2855# endif
2856#elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
2857 __asm__ __volatile__ (RTASM_ARM_DMB_SY :: RTASM_ARM_DMB_SY_IN_REG :);
2858#elif ARCH_BITS == 16
/* Fallback: atomic exchange on a dummy local; only the ordering effect of
   the exchange is wanted, the value is irrelevant. */
2859 uint16_t volatile u16;
2860 ASMAtomicXchgU16(&u16, 0);
2861#else
2862 uint32_t volatile u32;
2863 ASMAtomicXchgU32(&u32, 0);
2864#endif
2865}
2866
2867
2868/**
2869 * Write fence, waits for any pending writes to complete.
2870 * @note No implicit compiler barrier (which is probably stupid).
2871 */
2872DECLINLINE(void) ASMWriteFence(void) RT_NOTHROW_DEF
2873{
2874#if defined(RT_ARCH_AMD64) || (defined(RT_ARCH_X86) && !defined(RT_WITH_OLD_CPU_SUPPORT))
2875# if RT_INLINE_ASM_GNU_STYLE
2876 __asm__ __volatile__ (".byte 0x0f,0xae,0xf8\n\t");
2877# elif RT_INLINE_ASM_USES_INTRIN
2878 _mm_sfence();
2879# else
2880 __asm
2881 {
2882 _emit 0x0f
2883 _emit 0xae
2884 _emit 0xf8
2885 }
2886# endif
2887#elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
2888 __asm__ __volatile__ (RTASM_ARM_DMB_ST :: RTASM_ARM_DMB_ST_IN_REG :);
2889#else
2890 ASMMemoryFence();
2891#endif
2892}
2893
2894
2895/**
2896 * Read fence, waits for any pending reads to complete.
2897 * @note No implicit compiler barrier (which is probably stupid).
2898 */
2899DECLINLINE(void) ASMReadFence(void) RT_NOTHROW_DEF
2900{
2901#if defined(RT_ARCH_AMD64) || (defined(RT_ARCH_X86) && !defined(RT_WITH_OLD_CPU_SUPPORT))
2902# if RT_INLINE_ASM_GNU_STYLE
2903 __asm__ __volatile__ (".byte 0x0f,0xae,0xe8\n\t");
2904# elif RT_INLINE_ASM_USES_INTRIN
2905 _mm_lfence();
2906# else
2907 __asm
2908 {
2909 _emit 0x0f
2910 _emit 0xae
2911 _emit 0xe8
2912 }
2913# endif
2914#elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
2915 __asm__ __volatile__ (RTASM_ARM_DMB_LD :: RTASM_ARM_DMB_LD_IN_REG :);
2916#else
2917 ASMMemoryFence();
2918#endif
2919}
2920
2921
2922/**
2923 * Atomically reads an unsigned 8-bit value, ordered.
2924 *
2925 * @returns Current *pu8 value
2926 * @param pu8 Pointer to the 8-bit variable to read.
2927 */
2928DECLINLINE(uint8_t) ASMAtomicReadU8(volatile uint8_t RT_FAR *pu8) RT_NOTHROW_DEF
2929{
2930#if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
2931 uint32_t u32;
2932# if defined(RTASM_ARM64_USE_FEAT_LSE) && 0 /* very expensive on M1 */
2933 __asm__ __volatile__("Lstart_ASMAtomicReadU8_%=:\n\t"
2934 RTASM_ARM_DMB_SY
2935 "casab %w[uDst], wzr, %[pMem]\n\t"
2936 : [uDst] "=&r" (u32)
2937 : [pMem] "Q" (*pu8),
2938 "0" (0)
2939 RTASM_ARM_DMB_SY_COMMA_IN_REG);
2940# else
2941 __asm__ __volatile__("Lstart_ASMAtomicReadU8_%=:\n\t"
2942 RTASM_ARM_DMB_SY
2943# if defined(RT_ARCH_ARM64)
2944# if 1 /* shouldn't be any need for more than single-copy atomicity when we've got a proper barrier, just like on x86. */
2945 "ldurb %w[uDst], %[pMem]\n\t"
2946# else
2947 "ldxrb %w[uDst], %[pMem]\n\t"
2948 "clrex\n\t"
2949# endif
2950# else
2951 "ldrexb %[uDst], %[pMem]\n\t"
2952 /** @todo clrex */
2953# endif
2954 : [uDst] "=&r" (u32)
2955 : [pMem] "Q" (*pu8)
2956 RTASM_ARM_DMB_SY_COMMA_IN_REG);
2957# endif
2958 return (uint8_t)u32;
2959#else
2960 ASMMemoryFence();
2961 return *pu8; /* byte reads are atomic on x86 */
2962#endif
2963}
2964
2965
2966/**
2967 * Atomically reads an unsigned 8-bit value, unordered.
2968 *
2969 * @returns Current *pu8 value
2970 * @param pu8 Pointer to the 8-bit variable to read.
2971 */
2972DECLINLINE(uint8_t) ASMAtomicUoReadU8(volatile uint8_t RT_FAR *pu8) RT_NOTHROW_DEF
2973{
2974#if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
2975 uint32_t u32;
2976 __asm__ __volatile__("Lstart_ASMAtomicUoReadU8_%=:\n\t"
2977# if defined(RT_ARCH_ARM64)
2978 "ldurb %w[uDst], %[pMem]\n\t"
2979# else
2980 "ldrexb %[uDst], %[pMem]\n\t" /** @todo fix this */
2981# endif
2982 : [uDst] "=&r" (u32)
2983 : [pMem] "Q" (*pu8));
2984 return (uint8_t)u32;
2985#else
2986 return *pu8; /* byte reads are atomic on x86 */
2987#endif
2988}
2989
2990
2991/**
2992 * Atomically reads a signed 8-bit value, ordered.
2993 *
2994 * @returns Current *pi8 value
2995 * @param pi8 Pointer to the 8-bit variable to read.
2996 */
2997DECLINLINE(int8_t) ASMAtomicReadS8(volatile int8_t RT_FAR *pi8) RT_NOTHROW_DEF
2998{
2999#if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
3000 return (int8_t)ASMAtomicReadU8((volatile uint8_t RT_FAR *)pi8);
3001#else
3002 ASMMemoryFence();
3003 return *pi8; /* byte reads are atomic on x86 */
3004#endif
3005}
3006
3007
3008/**
3009 * Atomically reads a signed 8-bit value, unordered.
3010 *
3011 * @returns Current *pi8 value
3012 * @param pi8 Pointer to the 8-bit variable to read.
3013 */
3014DECLINLINE(int8_t) ASMAtomicUoReadS8(volatile int8_t RT_FAR *pi8) RT_NOTHROW_DEF
3015{
3016#if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
3017 int32_t i32;
3018 __asm__ __volatile__("Lstart_ASMAtomicUoReadS8_%=:\n\t"
3019# if defined(RT_ARCH_ARM64)
3020 "ldurb %w[iDst], %[pMem]\n\t"
3021# else
3022 "ldrexb %[iDst], %[pMem]\n\t" /** @todo fix this */
3023# endif
3024 : [iDst] "=&r" (i32)
3025 : [pMem] "Q" (*pi8));
3026 return (int8_t)i32;
3027#else
3028 return *pi8; /* byte reads are atomic on x86 */
3029#endif
3030}
3031
3032
3033/**
3034 * Atomically reads an unsigned 16-bit value, ordered.
3035 *
3036 * @returns Current *pu16 value
3037 * @param pu16 Pointer to the 16-bit variable to read.
3038 */
3039DECLINLINE(uint16_t) ASMAtomicReadU16(volatile uint16_t RT_FAR *pu16) RT_NOTHROW_DEF
3040{
3041 Assert(!((uintptr_t)pu16 & 1));
3042#if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
3043 uint32_t u32;
3044# if defined(RTASM_ARM64_USE_FEAT_LSE) && 0 /* very expensive on M1, but alignment advantages with LEA2 (M2?). */
3045 __asm__ __volatile__("Lstart_ASMAtomicReadU16_%=:\n\t"
3046 RTASM_ARM_DMB_SY
3047 "casah %w[uDst], wzr, %[pMem]\n\t"
3048 : [uDst] "=&r" (u32)
3049 : [pMem] "Q" (*pu16),
3050 "0" (0)
3051 RTASM_ARM_DMB_SY_COMMA_IN_REG);
3052# else
3053 __asm__ __volatile__("Lstart_ASMAtomicReadU16_%=:\n\t"
3054 RTASM_ARM_DMB_SY
3055# if defined(RT_ARCH_ARM64)
3056# if 1 /* ASSUMING proper barrier and aligned access, we should be fine with single-copy atomicity, just like on x86. */
3057 "ldurh %w[uDst], %[pMem]\n\t"
3058# else
3059 "ldxrh %w[uDst], %[pMem]\n\t"
3060 "clrex\n\t"
3061# endif
3062# else
3063 "ldrexh %[uDst], %[pMem]\n\t"
3064 /** @todo clrex */
3065# endif
3066 : [uDst] "=&r" (u32)
3067 : [pMem] "Q" (*pu16)
3068 RTASM_ARM_DMB_SY_COMMA_IN_REG);
3069# endif
3070 return (uint16_t)u32;
3071#else
3072 ASMMemoryFence();
3073 return *pu16;
3074#endif
3075}
3076
3077
3078/**
3079 * Atomically reads an unsigned 16-bit value, unordered.
3080 *
3081 * @returns Current *pu16 value
3082 * @param pu16 Pointer to the 16-bit variable to read.
3083 */
3084DECLINLINE(uint16_t) ASMAtomicUoReadU16(volatile uint16_t RT_FAR *pu16) RT_NOTHROW_DEF
3085{
3086 Assert(!((uintptr_t)pu16 & 1));
3087#if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
3088 uint32_t u32;
3089 __asm__ __volatile__("Lstart_ASMAtomicUoReadU16_%=:\n\t"
3090# if defined(RT_ARCH_ARM64)
3091 "ldurh %w[uDst], %[pMem]\n\t"
3092# else
3093 "ldrexh %[uDst], %[pMem]\n\t" /** @todo fix this */
3094# endif
3095 : [uDst] "=&r" (u32)
3096 : [pMem] "Q" (*pu16));
3097 return (uint16_t)u32;
3098#else
3099 return *pu16;
3100#endif
3101}
3102
3103
3104/**
3105 * Atomically reads a signed 16-bit value, ordered.
3106 *
3107 * @returns Current *pi16 value
3108 * @param pi16 Pointer to the 16-bit variable to read.
3109 */
3110DECLINLINE(int16_t) ASMAtomicReadS16(volatile int16_t RT_FAR *pi16) RT_NOTHROW_DEF
3111{
3112 Assert(!((uintptr_t)pi16 & 1));
3113#if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
3114 return (int16_t)ASMAtomicReadU16((volatile uint16_t RT_FAR *)pi16);
3115#else
3116 ASMMemoryFence();
3117 return *pi16;
3118#endif
3119}
3120
3121
3122/**
3123 * Atomically reads a signed 16-bit value, unordered.
3124 *
3125 * @returns Current *pi16 value
3126 * @param pi16 Pointer to the 16-bit variable to read.
3127 */
3128DECLINLINE(int16_t) ASMAtomicUoReadS16(volatile int16_t RT_FAR *pi16) RT_NOTHROW_DEF
3129{
3130 Assert(!((uintptr_t)pi16 & 1));
3131#if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
3132 int32_t i32;
3133 __asm__ __volatile__("Lstart_ASMAtomicUoReadS16_%=:\n\t"
3134# if defined(RT_ARCH_ARM64)
3135 "ldurh %w[iDst], %[pMem]\n\t"
3136# else
3137 "ldrexh %[iDst], %[pMem]\n\t" /** @todo fix this */
3138# endif
3139 : [iDst] "=&r" (i32)
3140 : [pMem] "Q" (*pi16));
3141 return (int16_t)i32;
3142#else
3143 return *pi16;
3144#endif
3145}
3146
3147
3148/**
3149 * Atomically reads an unsigned 32-bit value, ordered.
3150 *
3151 * @returns Current *pu32 value
3152 * @param pu32 Pointer to the 32-bit variable to read.
3153 */
3154DECLINLINE(uint32_t) ASMAtomicReadU32(volatile uint32_t RT_FAR *pu32) RT_NOTHROW_DEF
3155{
3156 Assert(!((uintptr_t)pu32 & 3));
3157#if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
3158 uint32_t u32;
3159# if defined(RTASM_ARM64_USE_FEAT_LSE) && 0 /* very expensive on M1, but alignment advantages with LEA2 (M2?). */
3160 __asm__ __volatile__("Lstart_ASMAtomicReadU32_%=:\n\t"
3161 RTASM_ARM_DMB_SY
3162 "casa %w[uDst], wzr, %[pMem]\n\t"
3163 : [uDst] "=&r" (u32)
3164 : [pMem] "Q" (*pu32),
3165 "0" (0)
3166 RTASM_ARM_DMB_SY_COMMA_IN_REG);
3167# else
3168 __asm__ __volatile__("Lstart_ASMAtomicReadU32_%=:\n\t"
3169 RTASM_ARM_DMB_SY
3170# if defined(RT_ARCH_ARM64)
3171# if 1 /* ASSUMING proper barrier and aligned access, we should be fine with single-copy atomicity, just like on x86. */
3172 "ldur %w[uDst], %[pMem]\n\t"
3173# else
3174 "ldxr %w[uDst], %[pMem]\n\t"
3175 "clrex\n\t"
3176# endif
3177# else
3178 "ldrex %[uDst], %[pMem]\n\t"
3179 /** @todo clrex */
3180# endif
3181 : [uDst] "=&r" (u32)
3182 : [pMem] "Q" (*pu32)
3183 RTASM_ARM_DMB_SY_COMMA_IN_REG);
3184# endif
3185 return u32;
3186#else
3187 ASMMemoryFence();
3188# if ARCH_BITS == 16
3189 AssertFailed(); /** @todo 16-bit */
3190# endif
3191 return *pu32;
3192#endif
3193}
3194
3195
3196/**
3197 * Atomically reads an unsigned 32-bit value, unordered.
3198 *
3199 * @returns Current *pu32 value
3200 * @param pu32 Pointer to the 32-bit variable to read.
3201 */
3202DECLINLINE(uint32_t) ASMAtomicUoReadU32(volatile uint32_t RT_FAR *pu32) RT_NOTHROW_DEF
3203{
3204 Assert(!((uintptr_t)pu32 & 3));
3205#if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
3206 uint32_t u32;
3207 __asm__ __volatile__("Lstart_ASMAtomicUoReadU32_%=:\n\t"
3208# if defined(RT_ARCH_ARM64)
3209 "ldur %w[uDst], %[pMem]\n\t"
3210# else
3211 "ldrex %[uDst], %[pMem]\n\t" /** @todo fix this */
3212# endif
3213 : [uDst] "=&r" (u32)
3214 : [pMem] "Q" (*pu32));
3215 return u32;
3216#else
3217# if ARCH_BITS == 16
3218 AssertFailed(); /** @todo 16-bit */
3219# endif
3220 return *pu32;
3221#endif
3222}
3223
3224
3225/**
3226 * Atomically reads a signed 32-bit value, ordered.
3227 *
3228 * @returns Current *pi32 value
3229 * @param pi32 Pointer to the 32-bit variable to read.
3230 */
3231DECLINLINE(int32_t) ASMAtomicReadS32(volatile int32_t RT_FAR *pi32) RT_NOTHROW_DEF
3232{
3233 Assert(!((uintptr_t)pi32 & 3));
3234#if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
3235 return (int32_t)ASMAtomicReadU32((volatile uint32_t RT_FAR *)pi32);
3236#else
3237 ASMMemoryFence();
3238# if ARCH_BITS == 16
3239 AssertFailed(); /** @todo 16-bit */
3240# endif
3241 return *pi32;
3242#endif
3243}
3244
3245
3246/**
3247 * Atomically reads a signed 32-bit value, unordered.
3248 *
3249 * @returns Current *pi32 value
3250 * @param pi32 Pointer to the 32-bit variable to read.
3251 */
3252DECLINLINE(int32_t) ASMAtomicUoReadS32(volatile int32_t RT_FAR *pi32) RT_NOTHROW_DEF
3253{
3254 Assert(!((uintptr_t)pi32 & 3));
3255#if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
3256 int32_t i32;
3257 __asm__ __volatile__("Lstart_ASMAtomicUoReadS32_%=:\n\t"
3258# if defined(RT_ARCH_ARM64)
3259 "ldur %w[iDst], %[pMem]\n\t"
3260# else
3261 "ldrex %[iDst], %[pMem]\n\t" /** @todo thix this */
3262# endif
3263 : [iDst] "=&r" (i32)
3264 : [pMem] "Q" (*pi32));
3265 return i32;
3266
3267#else
3268# if ARCH_BITS == 16
3269 AssertFailed(); /** @todo 16-bit */
3270# endif
3271 return *pi32;
3272#endif
3273}
3274
3275
/**
 * Atomically reads an unsigned 64-bit value, ordered.
 *
 * Implementation per target:
 *   - AMD64: ASMMemoryFence() followed by a plain 64-bit load.
 *   - 32-bit x86: LOCK CMPXCHG8B with zero as both the comparand (EDX:EAX)
 *     and the replacement (ECX:EBX), so EDX:EAX ends up holding the current
 *     value.  The instruction is a read-modify-write access, hence the
 *     writable-memory requirement below.
 *   - ARM64 / ARM32: the RTASM_ARM_DMB_SY barrier sequence followed by a
 *     64-bit load (ldur on ARM64, ldrexd on ARM32).
 * When the guard below is true only a prototype is emitted here.
 *
 * @returns Current *pu64 value
 * @param   pu64    Pointer to the 64-bit variable to read.
 *                  The memory pointed to must be writable.
 *
 * @remarks This may fault if the memory is read-only!
 * @remarks x86: Requires a Pentium or later.
 */
#if (RT_INLINE_ASM_EXTERNAL_TMP_ARM && !defined(RT_ARCH_AMD64)) \
 || RT_INLINE_DONT_MIX_CMPXCHG8B_AND_PIC
RT_ASM_DECL_PRAGMA_WATCOM(uint64_t) ASMAtomicReadU64(volatile uint64_t RT_FAR *pu64) RT_NOTHROW_PROTO;
#else
DECLINLINE(uint64_t) ASMAtomicReadU64(volatile uint64_t RT_FAR *pu64) RT_NOTHROW_DEF
{
    uint64_t u64;
# ifdef RT_ARCH_AMD64
    Assert(!((uintptr_t)pu64 & 7));
    /* Disabled explicit-assembly variant, kept for reference: */
/*#  if RT_INLINE_ASM_GNU_STYLE
    __asm__ __volatile__(  "mfence\n\t"
                           "movq %1, %0\n\t"
                         : "=r" (u64)
                         : "m" (*pu64));
#  else
    __asm
    {
        mfence
        mov     rdx, [pu64]
        mov     rax, [rdx]
        mov     [u64], rax
    }
#  endif*/
    ASMMemoryFence();
    u64 = *pu64;

# elif defined(RT_ARCH_X86)
#  if RT_INLINE_ASM_GNU_STYLE
#   if defined(PIC) || defined(__PIC__)
    /* EBX is not handed to the asm as an operand in PIC builds (presumably
       because it is reserved there).  It is swapped with a zeroed stack
       variable (%3) around the cmpxchg8b and restored afterwards. */
    uint32_t u32EBX = 0;
    Assert(!((uintptr_t)pu64 & 7));
    __asm__ __volatile__("xchgl %%ebx, %3\n\t"
                         "lock; cmpxchg8b (%5)\n\t"
                         "movl %3, %%ebx\n\t"
                         : "=A" (u64)           /* %0: EDX:EAX, the result. */
#    if RT_GNUC_PREREQ(4, 3)
                         , "+m" (*pu64)         /* %1 */
#    else
                         , "=m" (*pu64)         /* %1 */
#    endif
                         : "0" (0ULL)           /* %2: comparand = 0, shares EDX:EAX with %0. */
                         , "m" (u32EBX)         /* %3: zero / EBX save slot. */
                         , "c" (0)              /* %4: ECX = 0. */
                         , "S" (pu64)           /* %5: ESI = address. */
                         : "cc");
#   else /* !PIC */
    __asm__ __volatile__("lock; cmpxchg8b %1\n\t"
                         : "=A" (u64)           /* %0: EDX:EAX, the result. */
                         , "+m" (*pu64)         /* %1 */
                         : "0" (0ULL)           /* comparand = 0, shares EDX:EAX with %0. */
                         , "b" (0)              /* EBX = 0. */
                         , "c" (0)              /* ECX = 0. */
                         : "cc");
#   endif
#  else
    Assert(!((uintptr_t)pu64 & 7));
    __asm
    {
        xor     eax, eax
        xor     edx, edx
        mov     edi, pu64
        xor     ecx, ecx
        xor     ebx, ebx
        lock cmpxchg8b [edi]
        mov     dword ptr [u64], eax
        mov     dword ptr [u64 + 4], edx
    }
#  endif

# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
    Assert(!((uintptr_t)pu64 & 7));

#  if defined(RTASM_ARM64_USE_FEAT_LSE) && 0 /* very expensive on M1, but alignment advantages with LEA2 (M2?). */
    __asm__ __volatile__("Lstart_ASMAtomicReadU64_%=:\n\t"
                         RTASM_ARM_DMB_SY
                         "casa %[uDst], xzr, %[pMem]\n\t"
                         : [uDst] "=&r" (u64)
                         : [pMem] "Q" (*pu64),
                           "0" (0)
                           RTASM_ARM_DMB_SY_COMMA_IN_REG);
#  else
    __asm__ __volatile__("Lstart_ASMAtomicReadU64_%=:\n\t"
                         RTASM_ARM_DMB_SY
#   if defined(RT_ARCH_ARM64)
#    if 1 /* ASSUMING proper barrier and aligned access, we should be fine with single-copy atomicity, just like on x86. */
                         "ldur %[uDst], %[pMem]\n\t"
#    else
                         "ldxr %[uDst], %[pMem]\n\t"
                         "clrex\n\t"
#    endif
#   else
                         "ldrexd %[uDst], %H[uDst], %[pMem]\n\t"
                         /** @todo clrex */
#   endif
                         : [uDst] "=&r" (u64)
                         : [pMem] "Q" (*pu64)
                           RTASM_ARM_DMB_SY_COMMA_IN_REG);
#  endif
# else
#  error "Port me"
# endif
    return u64;
}
#endif
3390
3391
/**
 * Atomically reads an unsigned 64-bit value, unordered.
 *
 * Implementation per target:
 *   - AMD64: a plain 64-bit load.
 *   - 32-bit x86: LOCK CMPXCHG8B with zero as both the comparand (EDX:EAX)
 *     and the replacement (ECX:EBX), so EDX:EAX ends up holding the current
 *     value.  The instruction is a read-modify-write access, hence the
 *     writable-memory requirement below.
 *   - ARM64 / ARM32: a 64-bit load (ldur on ARM64, ldrexd on ARM32) without
 *     any barrier.
 * When the guard below is true only a prototype is emitted here.
 *
 * @returns Current *pu64 value
 * @param   pu64    Pointer to the 64-bit variable to read.
 *                  The memory pointed to must be writable.
 *
 * @remarks This may fault if the memory is read-only!
 * @remarks x86: Requires a Pentium or later.
 */
#if   !defined(RT_ARCH_AMD64) \
  && (   (RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN) \
      || RT_INLINE_DONT_MIX_CMPXCHG8B_AND_PIC)
RT_ASM_DECL_PRAGMA_WATCOM(uint64_t) ASMAtomicUoReadU64(volatile uint64_t RT_FAR *pu64) RT_NOTHROW_PROTO;
#else
DECLINLINE(uint64_t) ASMAtomicUoReadU64(volatile uint64_t RT_FAR *pu64) RT_NOTHROW_DEF
{
    uint64_t u64;
# ifdef RT_ARCH_AMD64
    Assert(!((uintptr_t)pu64 & 7));
    /* Disabled explicit-assembly variant, kept for reference: */
/*#  if RT_INLINE_ASM_GNU_STYLE
    Assert(!((uintptr_t)pu64 & 7));
    __asm__ __volatile__("movq %1, %0\n\t"
                         : "=r" (u64)
                         : "m" (*pu64));
#  else
    __asm
    {
        mov     rdx, [pu64]
        mov     rax, [rdx]
        mov     [u64], rax
    }
#  endif */
    u64 = *pu64;

# elif defined(RT_ARCH_X86)
#  if RT_INLINE_ASM_GNU_STYLE
#   if defined(PIC) || defined(__PIC__)
    /* EBX is not handed to the asm as an operand in PIC builds (presumably
       because it is reserved there).  It is swapped with a zeroed stack
       variable (%3) around the cmpxchg8b and restored afterwards.  EAX, ECX
       and EDX are zeroed inside the asm; ECX is declared as a dummy output
       (u32Spill) so the compiler knows it gets overwritten. */
    uint32_t u32EBX = 0;
    uint32_t u32Spill;
    Assert(!((uintptr_t)pu64 & 7));
    __asm__ __volatile__("xor %%eax,%%eax\n\t"
                         "xor %%ecx,%%ecx\n\t"
                         "xor %%edx,%%edx\n\t"
                         "xchgl %%ebx, %3\n\t"
                         "lock; cmpxchg8b (%4)\n\t"
                         "movl %3, %%ebx\n\t"
                         : "=A" (u64)           /* %0: EDX:EAX, the result. */
#    if RT_GNUC_PREREQ(4, 3)
                         , "+m" (*pu64)         /* %1 */
#    else
                         , "=m" (*pu64)         /* %1 */
#    endif
                         , "=c" (u32Spill)      /* %2: ECX, value unused. */
                         : "m" (u32EBX)         /* %3: zero / EBX save slot. */
                         , "S" (pu64)           /* %4: ESI = address. */
                         : "cc");
#   else /* !PIC */
    __asm__ __volatile__("lock; cmpxchg8b %1\n\t"
                         : "=A" (u64)           /* %0: EDX:EAX, the result. */
                         , "+m" (*pu64)         /* %1 */
                         : "0" (0ULL)           /* comparand = 0, shares EDX:EAX with %0. */
                         , "b" (0)              /* EBX = 0. */
                         , "c" (0)              /* ECX = 0. */
                         : "cc");
#   endif
#  else
    Assert(!((uintptr_t)pu64 & 7));
    __asm
    {
        xor     eax, eax
        xor     edx, edx
        mov     edi, pu64
        xor     ecx, ecx
        xor     ebx, ebx
        lock cmpxchg8b [edi]
        mov     dword ptr [u64], eax
        mov     dword ptr [u64 + 4], edx
    }
#  endif

# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
    Assert(!((uintptr_t)pu64 & 7));
    __asm__ __volatile__("Lstart_ASMAtomicUoReadU64_%=:\n\t"
#  if defined(RT_ARCH_ARM64)
                         "ldur %[uDst], %[pMem]\n\t"
#  else
                         "ldrexd %[uDst], %H[uDst], %[pMem]\n\t" /* this is required for atomic access since it's a pair */
                         /** @todo clrex? */
#  endif
                         : [uDst] "=&r" (u64)
                         : [pMem] "Q" (*pu64));

# else
#  error "Port me"
# endif
    return u64;
}
#endif
3491
3492
3493/**
3494 * Atomically reads a signed 64-bit value, ordered.
3495 *
3496 * @returns Current *pi64 value
3497 * @param pi64 Pointer to the 64-bit variable to read.
3498 * The memory pointed to must be writable.
3499 *
3500 * @remarks This may fault if the memory is read-only!
3501 * @remarks x86: Requires a Pentium or later.
3502 */
3503DECLINLINE(int64_t) ASMAtomicReadS64(volatile int64_t RT_FAR *pi64) RT_NOTHROW_DEF
3504{
3505 return (int64_t)ASMAtomicReadU64((volatile uint64_t RT_FAR *)pi64);
3506}
3507
3508
3509/**
3510 * Atomically reads a signed 64-bit value, unordered.
3511 *
3512 * @returns Current *pi64 value
3513 * @param pi64 Pointer to the 64-bit variable to read.
3514 * The memory pointed to must be writable.
3515 *
3516 * @remarks This will fault if the memory is read-only!
3517 * @remarks x86: Requires a Pentium or later.
3518 */
3519DECLINLINE(int64_t) ASMAtomicUoReadS64(volatile int64_t RT_FAR *pi64) RT_NOTHROW_DEF
3520{
3521 return (int64_t)ASMAtomicUoReadU64((volatile uint64_t RT_FAR *)pi64);
3522}
3523
3524
3525/**
3526 * Atomically reads a size_t value, ordered.
3527 *
3528 * @returns Current *pcb value
3529 * @param pcb Pointer to the size_t variable to read.
3530 */
3531DECLINLINE(size_t) ASMAtomicReadZ(size_t volatile RT_FAR *pcb) RT_NOTHROW_DEF
3532{
3533#if ARCH_BITS == 64
3534 return ASMAtomicReadU64((uint64_t volatile RT_FAR *)pcb);
3535#elif ARCH_BITS == 32
3536 return ASMAtomicReadU32((uint32_t volatile RT_FAR *)pcb);
3537#elif ARCH_BITS == 16
3538 AssertCompileSize(size_t, 2);
3539 return ASMAtomicReadU16((uint16_t volatile RT_FAR *)pcb);
3540#else
3541# error "Unsupported ARCH_BITS value"
3542#endif
3543}
3544
3545
3546/**
3547 * Atomically reads a size_t value, unordered.
3548 *
3549 * @returns Current *pcb value
3550 * @param pcb Pointer to the size_t variable to read.
3551 */
3552DECLINLINE(size_t) ASMAtomicUoReadZ(size_t volatile RT_FAR *pcb) RT_NOTHROW_DEF
3553{
3554#if ARCH_BITS == 64 || ARCH_BITS == 16
3555 return ASMAtomicUoReadU64((uint64_t volatile RT_FAR *)pcb);
3556#elif ARCH_BITS == 32
3557 return ASMAtomicUoReadU32((uint32_t volatile RT_FAR *)pcb);
3558#elif ARCH_BITS == 16
3559 AssertCompileSize(size_t, 2);
3560 return ASMAtomicUoReadU16((uint16_t volatile RT_FAR *)pcb);
3561#else
3562# error "Unsupported ARCH_BITS value"
3563#endif
3564}
3565
3566
3567/**
3568 * Atomically reads a pointer value, ordered.
3569 *
3570 * @returns Current *pv value
3571 * @param ppv Pointer to the pointer variable to read.
3572 *
3573 * @remarks Please use ASMAtomicReadPtrT, it provides better type safety and
3574 * requires less typing (no casts).
3575 */
3576DECLINLINE(void RT_FAR *) ASMAtomicReadPtr(void RT_FAR * volatile RT_FAR *ppv) RT_NOTHROW_DEF
3577{
3578#if ARCH_BITS == 32 || ARCH_BITS == 16
3579 return (void RT_FAR *)ASMAtomicReadU32((volatile uint32_t RT_FAR *)(void RT_FAR *)ppv);
3580#elif ARCH_BITS == 64
3581 return (void RT_FAR *)ASMAtomicReadU64((volatile uint64_t RT_FAR *)(void RT_FAR *)ppv);
3582#else
3583# error "ARCH_BITS is bogus"
3584#endif
3585}
3586
/**
 * Convenience macro for avoiding the annoying casting with ASMAtomicReadPtr.
 *
 * With GCC-compatible compilers this is a statement expression: @a ppv is
 * first assigned to a local of type 'typeof(*ppv) volatile *', and the value
 * returned by ASMAtomicReadPtr is converted to typeof(*ppv) before being
 * assigned to a local of type @a Type, which is the value of the expression.
 * Other compilers get a plain cast to @a Type.
 *
 * @returns Current *pv value
 * @param   ppv     Pointer to the pointer variable to read.
 * @param   Type    The type of *ppv, sans volatile.
 */
#ifdef __GNUC__ /* 8.2.0 requires -Wno-ignored-qualifiers */
# define ASMAtomicReadPtrT(ppv, Type) \
    __extension__ \
    ({\
        __typeof__(*(ppv)) volatile *ppvTypeChecked = (ppv); \
        Type pvTypeChecked = (__typeof__(*(ppv))) ASMAtomicReadPtr((void * volatile *)ppvTypeChecked); \
        pvTypeChecked; \
     })
#else
# define ASMAtomicReadPtrT(ppv, Type) \
    (Type)ASMAtomicReadPtr((void RT_FAR * volatile RT_FAR *)(ppv))
#endif
3606
3607
3608/**
3609 * Atomically reads a pointer value, unordered.
3610 *
3611 * @returns Current *pv value
3612 * @param ppv Pointer to the pointer variable to read.
3613 *
3614 * @remarks Please use ASMAtomicUoReadPtrT, it provides better type safety and
3615 * requires less typing (no casts).
3616 */
3617DECLINLINE(void RT_FAR *) ASMAtomicUoReadPtr(void RT_FAR * volatile RT_FAR *ppv) RT_NOTHROW_DEF
3618{
3619#if ARCH_BITS == 32 || ARCH_BITS == 16
3620 return (void RT_FAR *)ASMAtomicUoReadU32((volatile uint32_t RT_FAR *)(void RT_FAR *)ppv);
3621#elif ARCH_BITS == 64
3622 return (void RT_FAR *)ASMAtomicUoReadU64((volatile uint64_t RT_FAR *)(void RT_FAR *)ppv);
3623#else
3624# error "ARCH_BITS is bogus"
3625#endif
3626}
3627
3628
/**
 * Convenience macro for avoiding the annoying casting with ASMAtomicUoReadPtr.
 *
 * With GCC-compatible compilers this is a statement expression: @a ppv is
 * first assigned to a const local of type 'typeof(*ppv) volatile *', and the
 * value returned by ASMAtomicUoReadPtr is converted to typeof(*ppv) before
 * being assigned to a local of type @a Type, which is the value of the
 * expression.  Other compilers get a plain cast to @a Type.
 *
 * @returns Current *pv value
 * @param   ppv     Pointer to the pointer variable to read.
 * @param   Type    The type of *ppv, sans volatile.
 */
#ifdef __GNUC__ /* 8.2.0 requires -Wno-ignored-qualifiers */
# define ASMAtomicUoReadPtrT(ppv, Type) \
    __extension__ \
    ({\
        __typeof__(*(ppv)) volatile * const ppvTypeChecked = (ppv); \
        Type pvTypeChecked = (__typeof__(*(ppv))) ASMAtomicUoReadPtr((void * volatile *)ppvTypeChecked); \
        pvTypeChecked; \
     })
#else
# define ASMAtomicUoReadPtrT(ppv, Type) \
    (Type)ASMAtomicUoReadPtr((void RT_FAR * volatile RT_FAR *)(ppv))
#endif
3648
3649
3650/**
3651 * Atomically reads a boolean value, ordered.
3652 *
3653 * @returns Current *pf value
3654 * @param pf Pointer to the boolean variable to read.
3655 */
3656DECLINLINE(bool) ASMAtomicReadBool(volatile bool RT_FAR *pf) RT_NOTHROW_DEF
3657{
3658 ASMMemoryFence();
3659 return *pf; /* byte reads are atomic on x86 */
3660}
3661
3662
3663/**
3664 * Atomically reads a boolean value, unordered.
3665 *
3666 * @returns Current *pf value
3667 * @param pf Pointer to the boolean variable to read.
3668 */
3669DECLINLINE(bool) ASMAtomicUoReadBool(volatile bool RT_FAR *pf) RT_NOTHROW_DEF
3670{
3671 return *pf; /* byte reads are atomic on x86 */
3672}
3673
3674
/**
 * Atomically read a typical IPRT handle value, ordered.
 *
 * Expands to a do/while(0) statement which asserts (AssertCompile) that both
 * *ph and *phRes have the expected size and then stores the value read into
 * *phRes.  The 32-bit variant (ASMAtomicReadU32) is used when HC_ARCH_BITS is
 * 32 or ARCH_BITS is 16, the 64-bit variant (ASMAtomicReadU64) when
 * HC_ARCH_BITS is 64; anything else is a build error.
 *
 * @param   ph      Pointer to the handle variable to read.
 * @param   phRes   Where to store the result.
 *
 * @remarks This doesn't currently work for all handles (like RTFILE).
 */
#if HC_ARCH_BITS == 32 || ARCH_BITS == 16
# define ASMAtomicReadHandle(ph, phRes) \
    do { \
        AssertCompile(sizeof(*(ph)) == sizeof(uint32_t)); \
        AssertCompile(sizeof(*(phRes)) == sizeof(uint32_t)); \
        *(uint32_t RT_FAR *)(phRes) = ASMAtomicReadU32((uint32_t volatile RT_FAR *)(ph)); \
    } while (0)
#elif HC_ARCH_BITS == 64
# define ASMAtomicReadHandle(ph, phRes) \
    do { \
        AssertCompile(sizeof(*(ph)) == sizeof(uint64_t)); \
        AssertCompile(sizeof(*(phRes)) == sizeof(uint64_t)); \
        *(uint64_t RT_FAR *)(phRes) = ASMAtomicReadU64((uint64_t volatile RT_FAR *)(ph)); \
    } while (0)
#else
# error HC_ARCH_BITS
#endif
3700
3701
/**
 * Atomically read a typical IPRT handle value, unordered.
 *
 * Expands to a do/while(0) statement which asserts (AssertCompile) that both
 * *ph and *phRes have the expected size and then stores the value read into
 * *phRes.  The 32-bit variant (ASMAtomicUoReadU32) is used when HC_ARCH_BITS
 * is 32 or ARCH_BITS is 16, the 64-bit variant (ASMAtomicUoReadU64) when
 * HC_ARCH_BITS is 64; anything else is a build error.
 *
 * @param   ph      Pointer to the handle variable to read.
 * @param   phRes   Where to store the result.
 *
 * @remarks This doesn't currently work for all handles (like RTFILE).
 */
#if HC_ARCH_BITS == 32 || ARCH_BITS == 16
# define ASMAtomicUoReadHandle(ph, phRes) \
    do { \
        AssertCompile(sizeof(*(ph)) == sizeof(uint32_t)); \
        AssertCompile(sizeof(*(phRes)) == sizeof(uint32_t)); \
        *(uint32_t RT_FAR *)(phRes) = ASMAtomicUoReadU32((uint32_t volatile RT_FAR *)(ph)); \
    } while (0)
#elif HC_ARCH_BITS == 64
# define ASMAtomicUoReadHandle(ph, phRes) \
    do { \
        AssertCompile(sizeof(*(ph)) == sizeof(uint64_t)); \
        AssertCompile(sizeof(*(phRes)) == sizeof(uint64_t)); \
        *(uint64_t RT_FAR *)(phRes) = ASMAtomicUoReadU64((uint64_t volatile RT_FAR *)(ph)); \
    } while (0)
#else
# error HC_ARCH_BITS
#endif
3727
3728
/**
 * Atomically read a value which size might differ
 * between platforms or compilers, ordered.
 *
 * Expands to a do/while(0) statement that switches on sizeof(*pu) and calls
 * the matching ASMAtomicReadU8/U16/U32/U64.  Any other size triggers an
 * assertion and leaves *puRes untouched.
 *
 * @param   pu      Pointer to the variable to read.
 * @param   puRes   Where to store the result.
 */
#define ASMAtomicReadSize(pu, puRes) \
    do { \
        switch (sizeof(*(pu))) { \
            case 1: *(uint8_t  RT_FAR *)(puRes) = ASMAtomicReadU8( (volatile uint8_t  RT_FAR *)(void RT_FAR *)(pu)); break; \
            case 2: *(uint16_t RT_FAR *)(puRes) = ASMAtomicReadU16((volatile uint16_t RT_FAR *)(void RT_FAR *)(pu)); break; \
            case 4: *(uint32_t RT_FAR *)(puRes) = ASMAtomicReadU32((volatile uint32_t RT_FAR *)(void RT_FAR *)(pu)); break; \
            case 8: *(uint64_t RT_FAR *)(puRes) = ASMAtomicReadU64((volatile uint64_t RT_FAR *)(void RT_FAR *)(pu)); break; \
            /* sizeof yields size_t; cast so the argument matches the %u conversion. */ \
            default: AssertMsgFailed(("ASMAtomicReadSize: size %u is not supported\n", (unsigned)sizeof(*(pu)))); break; \
        } \
    } while (0)
3746
3747
/**
 * Atomically read a value which size might differ
 * between platforms or compilers, unordered.
 *
 * Expands to a do/while(0) statement that switches on sizeof(*pu) and calls
 * the matching ASMAtomicUoReadU8/U16/U32/U64.  Any other size triggers an
 * assertion and leaves *puRes untouched.
 *
 * @param   pu      Pointer to the variable to read.
 * @param   puRes   Where to store the result.
 */
#define ASMAtomicUoReadSize(pu, puRes) \
    do { \
        switch (sizeof(*(pu))) { \
            case 1: *(uint8_t  RT_FAR *)(puRes) = ASMAtomicUoReadU8( (volatile uint8_t  RT_FAR *)(void RT_FAR *)(pu)); break; \
            case 2: *(uint16_t RT_FAR *)(puRes) = ASMAtomicUoReadU16((volatile uint16_t RT_FAR *)(void RT_FAR *)(pu)); break; \
            case 4: *(uint32_t RT_FAR *)(puRes) = ASMAtomicUoReadU32((volatile uint32_t RT_FAR *)(void RT_FAR *)(pu)); break; \
            case 8: *(uint64_t RT_FAR *)(puRes) = ASMAtomicUoReadU64((volatile uint64_t RT_FAR *)(void RT_FAR *)(pu)); break; \
            /* The message names this macro; sizeof yields size_t, so cast it to match %u. */ \
            default: AssertMsgFailed(("ASMAtomicUoReadSize: size %u is not supported\n", (unsigned)sizeof(*(pu)))); break; \
        } \
    } while (0)
3765
3766
3767/**
3768 * Atomically writes an unsigned 8-bit value, ordered.
3769 *
3770 * @param pu8 Pointer to the 8-bit variable.
3771 * @param u8 The 8-bit value to assign to *pu8.
3772 */
3773DECLINLINE(void) ASMAtomicWriteU8(volatile uint8_t RT_FAR *pu8, uint8_t u8) RT_NOTHROW_DEF
3774{
3775#if defined(RT_ARCH_ARM64)
3776 /* The DMB SY will ensure ordering a la x86, the stlrb is probably overkill
3777 as all byte accesses are single-copy atomic, which I think suffices here. */
3778 __asm__ __volatile__("Lstart_ASMAtomicWriteU8_%=:\n\t"
3779# if defined(RTASM_ARM64_USE_FEAT_LSE) && 0 /* this is a lot slower and has no alignment benefits with LSE2 */
3780 RTASM_ARM_DMB_SY
3781 "swpb %w[uValue], wzr, %[pMem]\n\t"
3782# else
3783 RTASM_ARM_DMB_SY
3784 "stlrb %w[uValue], %[pMem]\n\t" /* single-copy atomic w/ release semantics. */
3785# endif
3786 : [pMem] "+Q" (*pu8)
3787 : [uValue] "r" ((uint32_t)u8)
3788 : );
3789#else
3790 ASMAtomicXchgU8(pu8, u8);
3791#endif
3792}
3793
3794
3795/**
3796 * Atomically writes an unsigned 8-bit value, unordered.
3797 *
3798 * @param pu8 Pointer to the 8-bit variable.
3799 * @param u8 The 8-bit value to assign to *pu8.
3800 */
3801DECLINLINE(void) ASMAtomicUoWriteU8(volatile uint8_t RT_FAR *pu8, uint8_t u8) RT_NOTHROW_DEF
3802{
3803 *pu8 = u8; /* byte writes are atomic on x86 */
3804}
3805
3806
3807/**
3808 * Atomically writes a signed 8-bit value, ordered.
3809 *
3810 * @param pi8 Pointer to the 8-bit variable to read.
3811 * @param i8 The 8-bit value to assign to *pi8.
3812 */
3813DECLINLINE(void) ASMAtomicWriteS8(volatile int8_t RT_FAR *pi8, int8_t i8) RT_NOTHROW_DEF
3814{
3815#if defined(RT_ARCH_ARM64)
3816 ASMAtomicWriteU8((volatile uint8_t RT_FAR *)pi8, (uint8_t)i8);
3817#else
3818 ASMAtomicXchgS8(pi8, i8);
3819#endif
3820}
3821
3822
3823/**
3824 * Atomically writes a signed 8-bit value, unordered.
3825 *
3826 * @param pi8 Pointer to the 8-bit variable to write.
3827 * @param i8 The 8-bit value to assign to *pi8.
3828 */
3829DECLINLINE(void) ASMAtomicUoWriteS8(volatile int8_t RT_FAR *pi8, int8_t i8) RT_NOTHROW_DEF
3830{
3831 *pi8 = i8; /* byte writes are atomic on x86 */
3832}
3833
3834
3835/**
3836 * Atomically writes an unsigned 16-bit value, ordered.
3837 *
3838 * @param pu16 Pointer to the 16-bit variable to write.
3839 * @param u16 The 16-bit value to assign to *pu16.
3840 */
3841DECLINLINE(void) ASMAtomicWriteU16(volatile uint16_t RT_FAR *pu16, uint16_t u16) RT_NOTHROW_DEF
3842{
3843#if defined(RT_ARCH_ARM64)
3844 __asm__ __volatile__("Lstart_ASMAtomicWriteU16_%=:\n\t"
3845# if defined(RTASM_ARM64_USE_FEAT_LSE) /* slower on M1, but benefits from relaxed LSE2 alignment requirements (M2?). */
3846 RTASM_ARM_DMB_SY
3847 "swph %w[uValue], wzr, %[pMem]\n\t"
3848# else
3849 RTASM_ARM_DMB_SY
3850 "stlrh %w[uValue], %[pMem]\n\t" /* single-copy atomic w/ release semantics. */
3851# endif
3852 : [pMem] "+Q" (*pu16)
3853 : [uValue] "r" ((uint32_t)u16)
3854 : );
3855#else
3856 ASMAtomicXchgU16(pu16, u16);
3857#endif
3858}
3859
3860
3861/**
3862 * Atomically writes an unsigned 16-bit value, unordered.
3863 *
3864 * @param pu16 Pointer to the 16-bit variable to write.
3865 * @param u16 The 16-bit value to assign to *pu16.
3866 */
3867DECLINLINE(void) ASMAtomicUoWriteU16(volatile uint16_t RT_FAR *pu16, uint16_t u16) RT_NOTHROW_DEF
3868{
3869 Assert(!((uintptr_t)pu16 & 1));
3870 *pu16 = u16;
3871}
3872
3873
3874/**
3875 * Atomically writes a signed 16-bit value, ordered.
3876 *
3877 * @param pi16 Pointer to the 16-bit variable to write.
3878 * @param i16 The 16-bit value to assign to *pi16.
3879 */
3880DECLINLINE(void) ASMAtomicWriteS16(volatile int16_t RT_FAR *pi16, int16_t i16) RT_NOTHROW_DEF
3881{
3882#if defined(RT_ARCH_ARM64)
3883 ASMAtomicWriteU16((volatile uint16_t RT_FAR *)pi16, (uint16_t)i16);
3884#else
3885 ASMAtomicXchgS16(pi16, i16);
3886#endif
3887}
3888
3889
3890/**
3891 * Atomically writes a signed 16-bit value, unordered.
3892 *
3893 * @param pi16 Pointer to the 16-bit variable to write.
3894 * @param i16 The 16-bit value to assign to *pi16.
3895 */
3896DECLINLINE(void) ASMAtomicUoWriteS16(volatile int16_t RT_FAR *pi16, int16_t i16) RT_NOTHROW_DEF
3897{
3898 Assert(!((uintptr_t)pi16 & 1));
3899 *pi16 = i16;
3900}
3901
3902
3903/**
3904 * Atomically writes an unsigned 32-bit value, ordered.
3905 *
3906 * @param pu32 Pointer to the 32-bit variable to write.
3907 * @param u32 The 32-bit value to assign to *pu32.
3908 */
3909DECLINLINE(void) ASMAtomicWriteU32(volatile uint32_t RT_FAR *pu32, uint32_t u32) RT_NOTHROW_DEF
3910{
3911#if defined(RT_ARCH_ARM64)
3912 __asm__ __volatile__("Lstart_ASMAtomicWriteU32_%=:\n\t"
3913# if defined(RTASM_ARM64_USE_FEAT_LSE) /* slower on M1, but benefits from relaxed LSE2 alignment requirements (M2?). */
3914 RTASM_ARM_DMB_SY
3915 "swp %w[uValue], wzr, %[pMem]\n\t"
3916# else
3917 RTASM_ARM_DMB_SY
3918 "stlr %w[uValue], %[pMem]\n\t" /* single-copy atomic w/ release semantics. */
3919# endif
3920 : [pMem] "+Q" (*pu32)
3921 : [uValue] "r" (u32)
3922 : "cc");
3923#else
3924 ASMAtomicXchgU32(pu32, u32);
3925#endif
3926}
3927
3928
3929/**
3930 * Atomically writes an unsigned 32-bit value, unordered.
3931 *
3932 * @param pu32 Pointer to the 32-bit variable to write.
3933 * @param u32 The 32-bit value to assign to *pu32.
3934 */
3935DECLINLINE(void) ASMAtomicUoWriteU32(volatile uint32_t RT_FAR *pu32, uint32_t u32) RT_NOTHROW_DEF
3936{
3937 Assert(!((uintptr_t)pu32 & 3));
3938#if ARCH_BITS >= 32
3939 *pu32 = u32;
3940#else
3941 ASMAtomicXchgU32(pu32, u32);
3942#endif
3943}
3944
3945
3946/**
3947 * Atomically writes a signed 32-bit value, ordered.
3948 *
3949 * @param pi32 Pointer to the 32-bit variable to write.
3950 * @param i32 The 32-bit value to assign to *pi32.
3951 */
3952DECLINLINE(void) ASMAtomicWriteS32(volatile int32_t RT_FAR *pi32, int32_t i32) RT_NOTHROW_DEF
3953{
3954#if defined(RT_ARCH_ARM64)
3955 ASMAtomicWriteU32((volatile uint32_t RT_FAR *)pi32, (uint32_t)i32);
3956#else
3957 ASMAtomicXchgS32(pi32, i32);
3958#endif
3959}
3960
3961
3962/**
3963 * Atomically writes a signed 32-bit value, unordered.
3964 *
3965 * @param pi32 Pointer to the 32-bit variable to write.
3966 * @param i32 The 32-bit value to assign to *pi32.
3967 */
3968DECLINLINE(void) ASMAtomicUoWriteS32(volatile int32_t RT_FAR *pi32, int32_t i32) RT_NOTHROW_DEF
3969{
3970 Assert(!((uintptr_t)pi32 & 3));
3971#if ARCH_BITS >= 32
3972 *pi32 = i32;
3973#else
3974 ASMAtomicXchgS32(pi32, i32);
3975#endif
3976}
3977
3978
3979/**
3980 * Atomically writes an unsigned 64-bit value, ordered.
3981 *
3982 * @param pu64 Pointer to the 64-bit variable to write.
3983 * @param u64 The 64-bit value to assign to *pu64.
3984 */
3985DECLINLINE(void) ASMAtomicWriteU64(volatile uint64_t RT_FAR *pu64, uint64_t u64) RT_NOTHROW_DEF
3986{
3987#if defined(RT_ARCH_ARM64)
3988 __asm__ __volatile__("Lstart_ASMAtomicWriteU64_%=:\n\t"
3989# if defined(RTASM_ARM64_USE_FEAT_LSE) /* slower on M1, but benefits from relaxed LSE2 alignment requirements (M2?). */
3990 RTASM_ARM_DMB_SY
3991 "swp %[uValue], xzr, %[pMem]\n\t"
3992# else
3993 RTASM_ARM_DMB_SY /** @todo necessary? */
3994 "stlr %[uValue], %[pMem]\n\t"
3995# endif
3996 : [pMem] "+Q" (*pu64)
3997 : [uValue] "r" (u64)
3998 : );
3999#else
4000 ASMAtomicXchgU64(pu64, u64);
4001#endif
4002}
4003
4004
4005/**
4006 * Atomically writes an unsigned 64-bit value, unordered.
4007 *
4008 * @param pu64 Pointer to the 64-bit variable to write.
4009 * @param u64 The 64-bit value to assign to *pu64.
4010 */
4011DECLINLINE(void) ASMAtomicUoWriteU64(volatile uint64_t RT_FAR *pu64, uint64_t u64) RT_NOTHROW_DEF
4012{
4013 Assert(!((uintptr_t)pu64 & 7));
4014#if ARCH_BITS == 64
4015 *pu64 = u64;
4016#else
4017 ASMAtomicXchgU64(pu64, u64);
4018#endif
4019}
4020
4021
4022/**
4023 * Atomically writes a signed 64-bit value, ordered.
4024 *
4025 * @param pi64 Pointer to the 64-bit variable to write.
4026 * @param i64 The 64-bit value to assign to *pi64.
4027 */
4028DECLINLINE(void) ASMAtomicWriteS64(volatile int64_t RT_FAR *pi64, int64_t i64) RT_NOTHROW_DEF
4029{
4030#if defined(RT_ARCH_ARM64)
4031 ASMAtomicWriteU64((volatile uint64_t RT_FAR *)pi64, (uint64_t)i64);
4032#else
4033 ASMAtomicXchgS64(pi64, i64);
4034#endif
4035}
4036
4037
4038/**
4039 * Atomically writes a signed 64-bit value, unordered.
4040 *
4041 * @param pi64 Pointer to the 64-bit variable to write.
4042 * @param i64 The 64-bit value to assign to *pi64.
4043 */
4044DECLINLINE(void) ASMAtomicUoWriteS64(volatile int64_t RT_FAR *pi64, int64_t i64) RT_NOTHROW_DEF
4045{
4046 Assert(!((uintptr_t)pi64 & 7));
4047#if ARCH_BITS == 64
4048 *pi64 = i64;
4049#else
4050 ASMAtomicXchgS64(pi64, i64);
4051#endif
4052}
4053
4054
4055/**
4056 * Atomically writes a size_t value, ordered.
4057 *
4058 * @param pcb Pointer to the size_t variable to write.
4059 * @param cb The value to assign to *pcb.
4060 */
4061DECLINLINE(void) ASMAtomicWriteZ(volatile size_t RT_FAR *pcb, size_t cb) RT_NOTHROW_DEF
4062{
4063#if ARCH_BITS == 64
4064 ASMAtomicWriteU64((uint64_t volatile *)pcb, cb);
4065#elif ARCH_BITS == 32
4066 ASMAtomicWriteU32((uint32_t volatile *)pcb, cb);
4067#elif ARCH_BITS == 16
4068 AssertCompileSize(size_t, 2);
4069 ASMAtomicWriteU16((uint16_t volatile *)pcb, cb);
4070#else
4071# error "Unsupported ARCH_BITS value"
4072#endif
4073}
4074
4075
4076/**
4077 * Atomically writes a size_t value, unordered.
4078 *
4079 * @param pcb Pointer to the size_t variable to write.
4080 * @param cb The value to assign to *pcb.
4081 */
4082DECLINLINE(void) ASMAtomicUoWriteZ(volatile size_t RT_FAR *pcb, size_t cb) RT_NOTHROW_DEF
4083{
4084#if ARCH_BITS == 64
4085 ASMAtomicUoWriteU64((uint64_t volatile *)pcb, cb);
4086#elif ARCH_BITS == 32
4087 ASMAtomicUoWriteU32((uint32_t volatile *)pcb, cb);
4088#elif ARCH_BITS == 16
4089 AssertCompileSize(size_t, 2);
4090 ASMAtomicUoWriteU16((uint16_t volatile *)pcb, cb);
4091#else
4092# error "Unsupported ARCH_BITS value"
4093#endif
4094}
4095
4096
4097/**
4098 * Atomically writes a boolean value, unordered.
4099 *
4100 * @param pf Pointer to the boolean variable to write.
4101 * @param f The boolean value to assign to *pf.
4102 */
4103DECLINLINE(void) ASMAtomicWriteBool(volatile bool RT_FAR *pf, bool f) RT_NOTHROW_DEF
4104{
4105 ASMAtomicWriteU8((uint8_t volatile RT_FAR *)pf, f);
4106}
4107
4108
4109/**
4110 * Atomically writes a boolean value, unordered.
4111 *
4112 * @param pf Pointer to the boolean variable to write.
4113 * @param f The boolean value to assign to *pf.
4114 */
4115DECLINLINE(void) ASMAtomicUoWriteBool(volatile bool RT_FAR *pf, bool f) RT_NOTHROW_DEF
4116{
4117 *pf = f; /* byte writes are atomic on x86 */
4118}
4119
4120
4121/**
4122 * Atomically writes a pointer value, ordered.
4123 *
4124 * @param ppv Pointer to the pointer variable to write.
4125 * @param pv The pointer value to assign to *ppv.
4126 */
4127DECLINLINE(void) ASMAtomicWritePtrVoid(void RT_FAR * volatile RT_FAR *ppv, const void *pv) RT_NOTHROW_DEF
4128{
4129#if ARCH_BITS == 32 || ARCH_BITS == 16
4130 ASMAtomicWriteU32((volatile uint32_t RT_FAR *)(void RT_FAR *)ppv, (uint32_t)pv);
4131#elif ARCH_BITS == 64
4132 ASMAtomicWriteU64((volatile uint64_t RT_FAR *)(void RT_FAR *)ppv, (uint64_t)pv);
4133#else
4134# error "ARCH_BITS is bogus"
4135#endif
4136}
4137
4138
4139/**
4140 * Atomically writes a pointer value, unordered.
4141 *
4142 * @param ppv Pointer to the pointer variable to write.
4143 * @param pv The pointer value to assign to *ppv.
4144 */
4145DECLINLINE(void) ASMAtomicUoWritePtrVoid(void RT_FAR * volatile RT_FAR *ppv, const void *pv) RT_NOTHROW_DEF
4146{
4147#if ARCH_BITS == 32 || ARCH_BITS == 16
4148 ASMAtomicUoWriteU32((volatile uint32_t RT_FAR *)(void RT_FAR *)ppv, (uint32_t)pv);
4149#elif ARCH_BITS == 64
4150 ASMAtomicUoWriteU64((volatile uint64_t RT_FAR *)(void RT_FAR *)ppv, (uint64_t)pv);
4151#else
4152# error "ARCH_BITS is bogus"
4153#endif
4154}
4155
4156
/**
 * Atomically writes a pointer value, ordered.
 *
 * Statement macro (no value).  Checks at compile time that both arguments are
 * pointer sized, asserts that @a ppv is naturally aligned, then forwards to
 * ASMAtomicWritePtrVoid.
 *
 * @param   ppv     Pointer to the pointer variable to write.
 * @param   pv      The pointer value to assign to *ppv.  If NULL use
 *                  ASMAtomicWriteNullPtr or you'll land in trouble.
 *
 * @remarks This is relatively type safe on GCC platforms when @a pv isn't
 *          NULL.
 * @remarks In the GCC variant each argument is evaluated exactly once (uses
 *          inside sizeof/__typeof__ are not evaluated).  The generic variant
 *          mentions @a ppv in both the assertion and the call, so avoid
 *          arguments with side effects.
 */
#ifdef __GNUC__
# define ASMAtomicWritePtr(ppv, pv) \
    do \
    { \
        __typeof__(*(ppv)) volatile RT_FAR * const ppvTypeChecked = (ppv); \
        __typeof__(*(ppv)) const pvTypeChecked = (pv); \
        \
        AssertCompile(sizeof(*(ppv)) == sizeof(void RT_FAR *)); \
        AssertCompile(sizeof(pv) == sizeof(void RT_FAR *)); \
        Assert(!( (uintptr_t)ppvTypeChecked & ((ARCH_BITS / 8) - 1) )); \
        \
        ASMAtomicWritePtrVoid((void RT_FAR * volatile RT_FAR *)(ppvTypeChecked), (void RT_FAR *)(pvTypeChecked)); \
    } while (0)
#else
# define ASMAtomicWritePtr(ppv, pv) \
    do \
    { \
        AssertCompile(sizeof(*(ppv)) == sizeof(void RT_FAR *)); \
        AssertCompile(sizeof(pv) == sizeof(void RT_FAR *)); \
        Assert(!( (uintptr_t)(ppv) & ((ARCH_BITS / 8) - 1) )); \
        \
        ASMAtomicWritePtrVoid((void RT_FAR * volatile RT_FAR *)(ppv), (void RT_FAR *)(pv)); \
    } while (0)
#endif
4191
4192
4193/**
4194 * Atomically sets a pointer to NULL, ordered.
4195 *
4196 * @param ppv Pointer to the pointer variable that should be set to NULL.
4197 *
4198 * @remarks This is relatively type safe on GCC platforms.
4199 */
4200#if RT_GNUC_PREREQ(4, 2)
4201# define ASMAtomicWriteNullPtr(ppv) \
4202 do \
4203 { \
4204 __typeof__(*(ppv)) * const ppvTypeChecked = (ppv); \
4205 AssertCompile(sizeof(*ppv) == sizeof(void RT_FAR *)); \
4206 Assert(!( (uintptr_t)ppv & ((ARCH_BITS / 8) - 1) )); \
4207 ASMAtomicWritePtrVoid((void RT_FAR * volatile RT_FAR *)(ppvTypeChecked), NULL); \
4208 } while (0)
4209#else
4210# define ASMAtomicWriteNullPtr(ppv) \
4211 do \
4212 { \
4213 AssertCompile(sizeof(*ppv) == sizeof(void RT_FAR *)); \
4214 Assert(!( (uintptr_t)ppv & ((ARCH_BITS / 8) - 1) )); \
4215 ASMAtomicWritePtrVoid((void RT_FAR * volatile RT_FAR *)(ppv), NULL); \
4216 } while (0)
4217#endif
4218
4219
4220/**
4221 * Atomically writes a pointer value, unordered.
4222 *
4223 * @returns Current *pv value
4224 * @param ppv Pointer to the pointer variable.
4225 * @param pv The pointer value to assign to *ppv. If NULL use
4226 * ASMAtomicUoWriteNullPtr or you'll land in trouble.
4227 *
4228 * @remarks This is relatively type safe on GCC platforms when @a pv isn't
4229 * NULL.
4230 */
4231#if RT_GNUC_PREREQ(4, 2)
4232# define ASMAtomicUoWritePtr(ppv, pv) \
4233 do \
4234 { \
4235 __typeof__(*(ppv)) volatile * const ppvTypeChecked = (ppv); \
4236 __typeof__(*(ppv)) const pvTypeChecked = (pv); \
4237 \
4238 AssertCompile(sizeof(*ppv) == sizeof(void *)); \
4239 AssertCompile(sizeof(pv) == sizeof(void *)); \
4240 Assert(!( (uintptr_t)ppv & ((ARCH_BITS / 8) - 1) )); \
4241 \
4242 *(ppvTypeChecked) = pvTypeChecked; \
4243 } while (0)
4244#else
4245# define ASMAtomicUoWritePtr(ppv, pv) \
4246 do \
4247 { \
4248 AssertCompile(sizeof(*ppv) == sizeof(void RT_FAR *)); \
4249 AssertCompile(sizeof(pv) == sizeof(void RT_FAR *)); \
4250 Assert(!( (uintptr_t)ppv & ((ARCH_BITS / 8) - 1) )); \
4251 *(ppv) = pv; \
4252 } while (0)
4253#endif
4254
4255
/**
 * Atomically sets a pointer to NULL, unordered.
 *
 * Statement macro (no value).  Checks at compile time that *ppv is pointer
 * sized, asserts that @a ppv is naturally aligned, then assigns NULL through
 * the pointer.
 *
 * @param   ppv     Pointer to the pointer variable that should be set to NULL.
 *
 * @remarks This is relatively type safe on GCC platforms.
 * @remarks In the GCC variant @a ppv is evaluated exactly once.  The generic
 *          variant mentions it in both the assertion and the assignment, so
 *          avoid arguments with side effects.
 */
#ifdef __GNUC__
# define ASMAtomicUoWriteNullPtr(ppv) \
    do \
    { \
        __typeof__(*(ppv)) volatile * const ppvTypeChecked = (ppv); \
        AssertCompile(sizeof(*(ppv)) == sizeof(void *)); \
        Assert(!( (uintptr_t)ppvTypeChecked & ((ARCH_BITS / 8) - 1) )); \
        *(ppvTypeChecked) = NULL; \
    } while (0)
#else
# define ASMAtomicUoWriteNullPtr(ppv) \
    do \
    { \
        AssertCompile(sizeof(*(ppv)) == sizeof(void RT_FAR *)); \
        Assert(!( (uintptr_t)(ppv) & ((ARCH_BITS / 8) - 1) )); \
        *(ppv) = NULL; \
    } while (0)
#endif
4281
4282
4283/**
4284 * Atomically write a typical IPRT handle value, ordered.
4285 *
4286 * @param ph Pointer to the variable to update.
4287 * @param hNew The value to assign to *ph.
4288 *
4289 * @remarks This doesn't currently work for all handles (like RTFILE).
4290 */
4291#if HC_ARCH_BITS == 32 || ARCH_BITS == 16
4292# define ASMAtomicWriteHandle(ph, hNew) \
4293 do { \
4294 AssertCompile(sizeof(*(ph)) == sizeof(uint32_t)); \
4295 ASMAtomicWriteU32((uint32_t volatile RT_FAR *)(ph), (const uint32_t)(hNew)); \
4296 } while (0)
4297#elif HC_ARCH_BITS == 64
4298# define ASMAtomicWriteHandle(ph, hNew) \
4299 do { \
4300 AssertCompile(sizeof(*(ph)) == sizeof(uint64_t)); \
4301 ASMAtomicWriteU64((uint64_t volatile RT_FAR *)(ph), (const uint64_t)(hNew)); \
4302 } while (0)
4303#else
4304# error HC_ARCH_BITS
4305#endif
4306
4307
4308/**
4309 * Atomically write a typical IPRT handle value, unordered.
4310 *
4311 * @param ph Pointer to the variable to update.
4312 * @param hNew The value to assign to *ph.
4313 *
4314 * @remarks This doesn't currently work for all handles (like RTFILE).
4315 */
4316#if HC_ARCH_BITS == 32 || ARCH_BITS == 16
4317# define ASMAtomicUoWriteHandle(ph, hNew) \
4318 do { \
4319 AssertCompile(sizeof(*(ph)) == sizeof(uint32_t)); \
4320 ASMAtomicUoWriteU32((uint32_t volatile RT_FAR *)(ph), (const uint32_t)hNew); \
4321 } while (0)
4322#elif HC_ARCH_BITS == 64
4323# define ASMAtomicUoWriteHandle(ph, hNew) \
4324 do { \
4325 AssertCompile(sizeof(*(ph)) == sizeof(uint64_t)); \
4326 ASMAtomicUoWriteU64((uint64_t volatile RT_FAR *)(ph), (const uint64_t)hNew); \
4327 } while (0)
4328#else
4329# error HC_ARCH_BITS
4330#endif
4331
4332
/**
 * Atomically write a value which size might differ
 * between platforms or compilers, ordered.
 *
 * Statement macro.  Dispatches on sizeof(*pu) to the 8, 16, 32 or 64-bit
 * ordered writer; any other size triggers an assertion.
 *
 * @param   pu      Pointer to the variable to update.
 * @param   uNew    The value to assign to *pu.
 */
#define ASMAtomicWriteSize(pu, uNew) \
    do { \
        switch (sizeof(*(pu))) { \
            case 1: ASMAtomicWriteU8( (volatile uint8_t  RT_FAR *)(void RT_FAR *)(pu), (uint8_t )(uNew)); break; \
            case 2: ASMAtomicWriteU16((volatile uint16_t RT_FAR *)(void RT_FAR *)(pu), (uint16_t)(uNew)); break; \
            case 4: ASMAtomicWriteU32((volatile uint32_t RT_FAR *)(void RT_FAR *)(pu), (uint32_t)(uNew)); break; \
            case 8: ASMAtomicWriteU64((volatile uint64_t RT_FAR *)(void RT_FAR *)(pu), (uint64_t)(uNew)); break; \
            /* %d expects an int, so narrow the size_t explicitly. */ \
            default: AssertMsgFailed(("ASMAtomicWriteSize: size %d is not supported\n", (int)sizeof(*(pu)))); \
        } \
    } while (0)
4350
/**
 * Atomically write a value which size might differ
 * between platforms or compilers, unordered.
 *
 * Statement macro.  Dispatches on sizeof(*pu) to the 8, 16, 32 or 64-bit
 * unordered writer; any other size triggers an assertion.
 *
 * @param   pu      Pointer to the variable to update.
 * @param   uNew    The value to assign to *pu.
 */
#define ASMAtomicUoWriteSize(pu, uNew) \
    do { \
        switch (sizeof(*(pu))) { \
            case 1: ASMAtomicUoWriteU8( (volatile uint8_t  RT_FAR *)(void RT_FAR *)(pu), (uint8_t )(uNew)); break; \
            case 2: ASMAtomicUoWriteU16((volatile uint16_t RT_FAR *)(void RT_FAR *)(pu), (uint16_t)(uNew)); break; \
            case 4: ASMAtomicUoWriteU32((volatile uint32_t RT_FAR *)(void RT_FAR *)(pu), (uint32_t)(uNew)); break; \
            case 8: ASMAtomicUoWriteU64((volatile uint64_t RT_FAR *)(void RT_FAR *)(pu), (uint64_t)(uNew)); break; \
            /* Report this macro's own name; %d expects an int, so narrow the size_t explicitly. */ \
            default: AssertMsgFailed(("ASMAtomicUoWriteSize: size %d is not supported\n", (int)sizeof(*(pu)))); \
        } \
    } while (0)
4368
4369
4370
4371/**
4372 * Atomically exchanges and adds to a 16-bit value, ordered.
4373 *
4374 * @returns The old value.
4375 * @param pu16 Pointer to the value.
4376 * @param u16 Number to add.
4377 *
4378 * @remarks Currently not implemented, just to make 16-bit code happy.
4379 * @remarks x86: Requires a 486 or later.
4380 */
4381RT_ASM_DECL_PRAGMA_WATCOM(uint16_t) ASMAtomicAddU16(uint16_t volatile RT_FAR *pu16, uint32_t u16) RT_NOTHROW_PROTO;
4382
4383
4384/**
4385 * Atomically exchanges and adds to a 32-bit value, ordered.
4386 *
4387 * @returns The old value.
4388 * @param pu32 Pointer to the value.
4389 * @param u32 Number to add.
4390 *
4391 * @remarks x86: Requires a 486 or later.
4392 */
4393#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
4394RT_ASM_DECL_PRAGMA_WATCOM(uint32_t) ASMAtomicAddU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_PROTO;
4395#else
4396DECLINLINE(uint32_t) ASMAtomicAddU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_DEF
4397{
4398# if RT_INLINE_ASM_USES_INTRIN
4399 u32 = _InterlockedExchangeAdd((long RT_FAR *)pu32, u32);
4400 return u32;
4401
4402# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
4403# if RT_INLINE_ASM_GNU_STYLE
4404 __asm__ __volatile__("lock; xaddl %0, %1\n\t"
4405 : "=r" (u32)
4406 , "=m" (*pu32)
4407 : "0" (u32)
4408 , "m" (*pu32)
4409 : "memory"
4410 , "cc");
4411 return u32;
4412# else
4413 __asm
4414 {
4415 mov eax, [u32]
4416# ifdef RT_ARCH_AMD64
4417 mov rdx, [pu32]
4418 lock xadd [rdx], eax
4419# else
4420 mov edx, [pu32]
4421 lock xadd [edx], eax
4422# endif
4423 mov [u32], eax
4424 }
4425 return u32;
4426# endif
4427
4428# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
4429 /* M1 benchmark: ldaddal=6907 vs dmb+ldadd=2114 vs non-lse=6249 (ps/call) */
4430# if defined(RTASM_ARM64_USE_FEAT_LSE)
4431 uint32_t u32OldRet;
4432 __asm__ __volatile__("Lstart_ASMAtomicAddU32_%=:\n\t"
4433# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
4434 "ldaddal %w[uAddend], %w[uOldActual], %[pMem]\n\t"
4435# else
4436 RTASM_ARM_DMB_SY
4437 "ldadd %w[uAddend], %w[uOldActual], %[pMem]\n\t"
4438# endif
4439 : [pMem] "+Q" (*pu32)
4440 , [uOldActual] "=&r" (u32OldRet)
4441 : [uAddend] "r" (u32)
4442 : );
4443# else
4444 RTASM_ARM_LOAD_MODIFY_STORE_RET_OLD_32(ASMAtomicAddU32, pu32, DMB_SY,
4445 "add %w[uNew], %w[uOld], %w[uVal]\n\t",
4446 "add %[uNew], %[uOld], %[uVal]\n\t",
4447 [uVal] "r" (u32));
4448# endif
4449 return u32OldRet;
4450
4451# else
4452# error "Port me"
4453# endif
4454}
4455#endif
4456
4457
4458/**
4459 * Atomically exchanges and adds to a signed 32-bit value, ordered.
4460 *
4461 * @returns The old value.
4462 * @param pi32 Pointer to the value.
4463 * @param i32 Number to add.
4464 *
4465 * @remarks x86: Requires a 486 or later.
4466 */
4467DECLINLINE(int32_t) ASMAtomicAddS32(int32_t volatile RT_FAR *pi32, int32_t i32) RT_NOTHROW_DEF
4468{
4469 return (int32_t)ASMAtomicAddU32((uint32_t volatile RT_FAR *)pi32, (uint32_t)i32);
4470}
4471
4472
4473/**
4474 * Atomically exchanges and adds to a 64-bit value, ordered.
4475 *
4476 * @returns The old value.
4477 * @param pu64 Pointer to the value.
4478 * @param u64 Number to add.
4479 *
4480 * @remarks x86: Requires a Pentium or later.
4481 */
4482#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
4483DECLASM(uint64_t) ASMAtomicAddU64(uint64_t volatile RT_FAR *pu64, uint64_t u64) RT_NOTHROW_PROTO;
4484#else
4485DECLINLINE(uint64_t) ASMAtomicAddU64(uint64_t volatile RT_FAR *pu64, uint64_t u64) RT_NOTHROW_DEF
4486{
4487# if RT_INLINE_ASM_USES_INTRIN && defined(RT_ARCH_AMD64)
4488 u64 = _InterlockedExchangeAdd64((__int64 RT_FAR *)pu64, u64);
4489 return u64;
4490
4491# elif RT_INLINE_ASM_GNU_STYLE && defined(RT_ARCH_AMD64)
4492 __asm__ __volatile__("lock; xaddq %0, %1\n\t"
4493 : "=r" (u64)
4494 , "=m" (*pu64)
4495 : "0" (u64)
4496 , "m" (*pu64)
4497 : "memory"
4498 , "cc");
4499 return u64;
4500
4501# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
4502# if defined(RTASM_ARM64_USE_FEAT_LSE)
4503 uint64_t u64OldRet;
4504 __asm__ __volatile__("Lstart_ASMAtomicAddU64_%=:\n\t"
4505# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
4506 "ldaddal %[uAddend], %[uOldActual], %[pMem]\n\t"
4507# else
4508 RTASM_ARM_DMB_SY
4509 "ldadd %[uAddend], %[uOldActual], %[pMem]\n\t"
4510# endif
4511 : [pMem] "+Q" (*pu64)
4512 , [uOldActual] "=&r" (u64OldRet)
4513 : [uAddend] "r" (u64)
4514 : );
4515# else
4516 RTASM_ARM_LOAD_MODIFY_STORE_RET_OLD_64(ASMAtomicAddU64, pu64, DMB_SY,
4517 "add %[uNew], %[uOld], %[uVal]\n\t"
4518 ,
4519 "add %[uNew], %[uOld], %[uVal]\n\t"
4520 "adc %H[uNew], %H[uOld], %H[uVal]\n\t",
4521 [uVal] "r" (u64));
4522# endif
4523 return u64OldRet;
4524
4525# else
4526 uint64_t u64Old;
4527 for (;;)
4528 {
4529 uint64_t u64New;
4530 u64Old = ASMAtomicUoReadU64(pu64);
4531 u64New = u64Old + u64;
4532 if (ASMAtomicCmpXchgU64(pu64, u64New, u64Old))
4533 break;
4534 ASMNopPause();
4535 }
4536 return u64Old;
4537# endif
4538}
4539#endif
4540
4541
4542/**
4543 * Atomically exchanges and adds to a signed 64-bit value, ordered.
4544 *
4545 * @returns The old value.
4546 * @param pi64 Pointer to the value.
4547 * @param i64 Number to add.
4548 *
4549 * @remarks x86: Requires a Pentium or later.
4550 */
4551DECLINLINE(int64_t) ASMAtomicAddS64(int64_t volatile RT_FAR *pi64, int64_t i64) RT_NOTHROW_DEF
4552{
4553 return (int64_t)ASMAtomicAddU64((uint64_t volatile RT_FAR *)pi64, (uint64_t)i64);
4554}
4555
4556
4557/**
4558 * Atomically exchanges and adds to a size_t value, ordered.
4559 *
4560 * @returns The old value.
4561 * @param pcb Pointer to the size_t value.
4562 * @param cb Number to add.
4563 */
4564DECLINLINE(size_t) ASMAtomicAddZ(size_t volatile RT_FAR *pcb, size_t cb) RT_NOTHROW_DEF
4565{
4566#if ARCH_BITS == 64
4567 AssertCompileSize(size_t, 8);
4568 return ASMAtomicAddU64((uint64_t volatile RT_FAR *)pcb, cb);
4569#elif ARCH_BITS == 32
4570 AssertCompileSize(size_t, 4);
4571 return ASMAtomicAddU32((uint32_t volatile RT_FAR *)pcb, cb);
4572#elif ARCH_BITS == 16
4573 AssertCompileSize(size_t, 2);
4574 return ASMAtomicAddU16((uint16_t volatile RT_FAR *)pcb, cb);
4575#else
4576# error "Unsupported ARCH_BITS value"
4577#endif
4578}
4579
4580
/**
 * Atomically exchanges and adds a value which size might differ between
 * platforms or compilers, ordered.
 *
 * Statement macro.  Dispatches on sizeof(*pu) to the 32-bit or 64-bit add and
 * stores the previous value through @a puOld using the same width; any other
 * size triggers an assertion.
 *
 * @param   pu      Pointer to the variable to update.
 * @param   uNew    The value to add to *pu.
 * @param   puOld   Where to store the old value.
 */
#define ASMAtomicAddSize(pu, uNew, puOld) \
    do { \
        switch (sizeof(*(pu))) { \
            case 4: *(uint32_t RT_FAR *)(puOld) = ASMAtomicAddU32((volatile uint32_t RT_FAR *)(void RT_FAR *)(pu), (uint32_t)(uNew)); break; \
            case 8: *(uint64_t RT_FAR *)(puOld) = ASMAtomicAddU64((volatile uint64_t RT_FAR *)(void RT_FAR *)(pu), (uint64_t)(uNew)); break; \
            /* %d expects an int, so narrow the size_t explicitly. */ \
            default: AssertMsgFailed(("ASMAtomicAddSize: size %d is not supported\n", (int)sizeof(*(pu)))); \
        } \
    } while (0)
4597
4598
4599
4600/**
4601 * Atomically exchanges and subtracts to an unsigned 16-bit value, ordered.
4602 *
4603 * @returns The old value.
4604 * @param pu16 Pointer to the value.
4605 * @param u16 Number to subtract.
4606 *
4607 * @remarks x86: Requires a 486 or later.
4608 */
4609DECLINLINE(uint16_t) ASMAtomicSubU16(uint16_t volatile RT_FAR *pu16, uint32_t u16) RT_NOTHROW_DEF
4610{
4611 return ASMAtomicAddU16(pu16, (uint16_t)-(int16_t)u16);
4612}
4613
4614
4615/**
4616 * Atomically exchanges and subtracts to a signed 16-bit value, ordered.
4617 *
4618 * @returns The old value.
4619 * @param pi16 Pointer to the value.
4620 * @param i16 Number to subtract.
4621 *
4622 * @remarks x86: Requires a 486 or later.
4623 */
4624DECLINLINE(int16_t) ASMAtomicSubS16(int16_t volatile RT_FAR *pi16, int16_t i16) RT_NOTHROW_DEF
4625{
4626 return (int16_t)ASMAtomicAddU16((uint16_t volatile RT_FAR *)pi16, (uint16_t)-i16);
4627}
4628
4629
4630/**
4631 * Atomically exchanges and subtracts to an unsigned 32-bit value, ordered.
4632 *
4633 * @returns The old value.
4634 * @param pu32 Pointer to the value.
4635 * @param u32 Number to subtract.
4636 *
4637 * @remarks x86: Requires a 486 or later.
4638 */
4639DECLINLINE(uint32_t) ASMAtomicSubU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_DEF
4640{
4641 return ASMAtomicAddU32(pu32, (uint32_t)-(int32_t)u32);
4642}
4643
4644
4645/**
4646 * Atomically exchanges and subtracts to a signed 32-bit value, ordered.
4647 *
4648 * @returns The old value.
4649 * @param pi32 Pointer to the value.
4650 * @param i32 Number to subtract.
4651 *
4652 * @remarks x86: Requires a 486 or later.
4653 */
4654DECLINLINE(int32_t) ASMAtomicSubS32(int32_t volatile RT_FAR *pi32, int32_t i32) RT_NOTHROW_DEF
4655{
4656 return (int32_t)ASMAtomicAddU32((uint32_t volatile RT_FAR *)pi32, (uint32_t)-i32);
4657}
4658
4659
4660/**
4661 * Atomically exchanges and subtracts to an unsigned 64-bit value, ordered.
4662 *
4663 * @returns The old value.
4664 * @param pu64 Pointer to the value.
4665 * @param u64 Number to subtract.
4666 *
4667 * @remarks x86: Requires a Pentium or later.
4668 */
4669DECLINLINE(uint64_t) ASMAtomicSubU64(uint64_t volatile RT_FAR *pu64, uint64_t u64) RT_NOTHROW_DEF
4670{
4671 return ASMAtomicAddU64(pu64, (uint64_t)-(int64_t)u64);
4672}
4673
4674
4675/**
4676 * Atomically exchanges and subtracts to a signed 64-bit value, ordered.
4677 *
4678 * @returns The old value.
4679 * @param pi64 Pointer to the value.
4680 * @param i64 Number to subtract.
4681 *
4682 * @remarks x86: Requires a Pentium or later.
4683 */
4684DECLINLINE(int64_t) ASMAtomicSubS64(int64_t volatile RT_FAR *pi64, int64_t i64) RT_NOTHROW_DEF
4685{
4686 return (int64_t)ASMAtomicAddU64((uint64_t volatile RT_FAR *)pi64, (uint64_t)-i64);
4687}
4688
4689
4690/**
4691 * Atomically exchanges and subtracts to a size_t value, ordered.
4692 *
4693 * @returns The old value.
4694 * @param pcb Pointer to the size_t value.
4695 * @param cb Number to subtract.
4696 *
4697 * @remarks x86: Requires a 486 or later.
4698 */
4699DECLINLINE(size_t) ASMAtomicSubZ(size_t volatile RT_FAR *pcb, size_t cb) RT_NOTHROW_DEF
4700{
4701#if ARCH_BITS == 64
4702 return ASMAtomicSubU64((uint64_t volatile RT_FAR *)pcb, cb);
4703#elif ARCH_BITS == 32
4704 return ASMAtomicSubU32((uint32_t volatile RT_FAR *)pcb, cb);
4705#elif ARCH_BITS == 16
4706 AssertCompileSize(size_t, 2);
4707 return ASMAtomicSubU16((uint16_t volatile RT_FAR *)pcb, cb);
4708#else
4709# error "Unsupported ARCH_BITS value"
4710#endif
4711}
4712
4713
/**
 * Atomically exchanges and subtracts a value which size might differ between
 * platforms or compilers, ordered.
 *
 * Statement macro.  Dispatches on sizeof(*pu) to the 32-bit or 64-bit
 * subtract and stores the previous value through @a puOld using the same
 * width; any other size triggers an assertion.
 *
 * @param   pu      Pointer to the variable to update.
 * @param   uNew    The value to subtract from *pu.
 * @param   puOld   Where to store the old value.
 *
 * @remarks x86: Requires a 486 or later.
 */
#define ASMAtomicSubSize(pu, uNew, puOld) \
    do { \
        switch (sizeof(*(pu))) { \
            case 4: *(uint32_t RT_FAR *)(puOld) = ASMAtomicSubU32((volatile uint32_t RT_FAR *)(void RT_FAR *)(pu), (uint32_t)(uNew)); break; \
            case 8: *(uint64_t RT_FAR *)(puOld) = ASMAtomicSubU64((volatile uint64_t RT_FAR *)(void RT_FAR *)(pu), (uint64_t)(uNew)); break; \
            /* %d expects an int, so narrow the size_t explicitly. */ \
            default: AssertMsgFailed(("ASMAtomicSubSize: size %d is not supported\n", (int)sizeof(*(pu)))); \
        } \
    } while (0)
4732
4733
4734
4735/**
4736 * Atomically increment a 16-bit value, ordered.
4737 *
4738 * @returns The new value.
4739 * @param pu16 Pointer to the value to increment.
4740 * @remarks Not implemented. Just to make 16-bit code happy.
4741 *
4742 * @remarks x86: Requires a 486 or later.
4743 */
4744RT_ASM_DECL_PRAGMA_WATCOM(uint16_t) ASMAtomicIncU16(uint16_t volatile RT_FAR *pu16) RT_NOTHROW_PROTO;
4745
4746
4747/**
4748 * Atomically increment a 32-bit value, ordered.
4749 *
4750 * @returns The new value.
4751 * @param pu32 Pointer to the value to increment.
4752 *
4753 * @remarks x86: Requires a 486 or later.
4754 */
4755#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
4756RT_ASM_DECL_PRAGMA_WATCOM(uint32_t) ASMAtomicIncU32(uint32_t volatile RT_FAR *pu32) RT_NOTHROW_PROTO;
4757#else
4758DECLINLINE(uint32_t) ASMAtomicIncU32(uint32_t volatile RT_FAR *pu32) RT_NOTHROW_DEF
4759{
4760# if RT_INLINE_ASM_USES_INTRIN
4761 return (uint32_t)_InterlockedIncrement((long RT_FAR *)pu32);
4762
4763# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
4764# if RT_INLINE_ASM_GNU_STYLE
4765 uint32_t u32;
4766 __asm__ __volatile__("lock; xaddl %0, %1\n\t"
4767 : "=r" (u32)
4768 , "=m" (*pu32)
4769 : "0" (1)
4770 , "m" (*pu32)
4771 : "memory"
4772 , "cc");
4773 return u32+1;
4774# else
4775 __asm
4776 {
4777 mov eax, 1
4778# ifdef RT_ARCH_AMD64
4779 mov rdx, [pu32]
4780 lock xadd [rdx], eax
4781# else
4782 mov edx, [pu32]
4783 lock xadd [edx], eax
4784# endif
4785 mov u32, eax
4786 }
4787 return u32+1;
4788# endif
4789
4790# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
4791 /* M1 benchmark: ldaddal=6887 vs dmb+ldadd=2117 vs non-lse=6247 (ps/call) */
4792# if defined(RTASM_ARM64_USE_FEAT_LSE)
4793 uint32_t u32NewRet;
4794 __asm__ __volatile__("Lstart_ASMAtomicIncU32_%=:\n\t"
4795# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
4796 "ldaddal %w[uAddend], %w[uNewRet], %[pMem]\n\t"
4797# else
4798 RTASM_ARM_DMB_SY
4799 "ldadd %w[uAddend], %w[uNewRet], %[pMem]\n\t"
4800# endif
4801 "add %w[uNewRet], %w[uNewRet], #1\n\t"
4802 : [pMem] "+Q" (*pu32)
4803 , [uNewRet] "=&r" (u32NewRet)
4804 : [uAddend] "r" ((uint32_t)1)
4805 : );
4806# else
4807 RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_32(ASMAtomicIncU32, pu32, DMB_SY,
4808 "add %w[uNew], %w[uNew], #1\n\t",
4809 "add %[uNew], %[uNew], #1\n\t" /* arm6 / thumb2+ */,
4810 "X" (0) /* dummy */);
4811# endif
4812 return u32NewRet;
4813
4814# else
4815 return ASMAtomicAddU32(pu32, 1) + 1;
4816# endif
4817}
4818#endif
4819
4820
4821/**
4822 * Atomically increment a signed 32-bit value, ordered.
4823 *
4824 * @returns The new value.
4825 * @param pi32 Pointer to the value to increment.
4826 *
4827 * @remarks x86: Requires a 486 or later.
4828 */
4829DECLINLINE(int32_t) ASMAtomicIncS32(int32_t volatile RT_FAR *pi32) RT_NOTHROW_DEF
4830{
4831 return (int32_t)ASMAtomicIncU32((uint32_t volatile RT_FAR *)pi32);
4832}
4833
4834
4835/**
4836 * Atomically increment a 64-bit value, ordered.
4837 *
4838 * @returns The new value.
4839 * @param pu64 Pointer to the value to increment.
4840 *
4841 * @remarks x86: Requires a Pentium or later.
4842 */
4843#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
4844DECLASM(uint64_t) ASMAtomicIncU64(uint64_t volatile RT_FAR *pu64) RT_NOTHROW_PROTO;
4845#else
4846DECLINLINE(uint64_t) ASMAtomicIncU64(uint64_t volatile RT_FAR *pu64) RT_NOTHROW_DEF
4847{
4848# if RT_INLINE_ASM_USES_INTRIN && defined(RT_ARCH_AMD64)
4849 return (uint64_t)_InterlockedIncrement64((__int64 RT_FAR *)pu64);
4850
4851# elif RT_INLINE_ASM_GNU_STYLE && defined(RT_ARCH_AMD64)
4852 uint64_t u64;
4853 __asm__ __volatile__("lock; xaddq %0, %1\n\t"
4854 : "=r" (u64)
4855 , "=m" (*pu64)
4856 : "0" (1)
4857 , "m" (*pu64)
4858 : "memory"
4859 , "cc");
4860 return u64 + 1;
4861
4862# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
4863# if defined(RTASM_ARM64_USE_FEAT_LSE)
4864 uint64_t u64NewRet;
4865 __asm__ __volatile__("Lstart_ASMAtomicIncU64_%=:\n\t"
4866# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
4867 "ldaddal %[uAddend], %[uNewRet], %[pMem]\n\t"
4868# else
4869 RTASM_ARM_DMB_SY
4870 "ldadd %[uAddend], %[uNewRet], %[pMem]\n\t"
4871# endif
4872 "add %[uNewRet], %[uNewRet], #1\n\t"
4873 : [pMem] "+Q" (*pu64)
4874 , [uNewRet] "=&r" (u64NewRet)
4875 : [uAddend] "r" ((uint64_t)1)
4876 : );
4877# else
4878 RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_64(ASMAtomicIncU64, pu64, DMB_SY,
4879 "add %[uNew], %[uNew], #1\n\t"
4880 ,
4881 "add %[uNew], %[uNew], #1\n\t" /* arm6 / thumb2+ */
4882 "adc %H[uNew], %H[uNew], %[uZeroVal]\n\t",
4883 RTASM_ARM_PICK_6432("X" (0) /* dummy */, [uZeroVal] "r" (0)) );
4884# endif
4885 return u64NewRet;
4886
4887# else
4888 return ASMAtomicAddU64(pu64, 1) + 1;
4889# endif
4890}
4891#endif
4892
4893
4894/**
4895 * Atomically increment a signed 64-bit value, ordered.
4896 *
4897 * @returns The new value.
4898 * @param pi64 Pointer to the value to increment.
4899 *
4900 * @remarks x86: Requires a Pentium or later.
4901 */
4902DECLINLINE(int64_t) ASMAtomicIncS64(int64_t volatile RT_FAR *pi64) RT_NOTHROW_DEF
4903{
4904 return (int64_t)ASMAtomicIncU64((uint64_t volatile RT_FAR *)pi64);
4905}
4906
4907
4908/**
4909 * Atomically increment a size_t value, ordered.
4910 *
4911 * @returns The new value.
4912 * @param pcb Pointer to the value to increment.
4913 *
4914 * @remarks x86: Requires a 486 or later.
4915 */
4916DECLINLINE(size_t) ASMAtomicIncZ(size_t volatile RT_FAR *pcb) RT_NOTHROW_DEF
4917{
4918#if ARCH_BITS == 64
4919 return ASMAtomicIncU64((uint64_t volatile RT_FAR *)pcb);
4920#elif ARCH_BITS == 32
4921 return ASMAtomicIncU32((uint32_t volatile RT_FAR *)pcb);
4922#elif ARCH_BITS == 16
4923 return ASMAtomicIncU16((uint16_t volatile RT_FAR *)pcb);
4924#else
4925# error "Unsupported ARCH_BITS value"
4926#endif
4927}
4928
4929
4930
4931/**
4932 * Atomically decrement an unsigned 16-bit value, ordered.
4933 *
 * Only a prototype is given here; ASMAtomicDecZ() refers to it when
 * ARCH_BITS is 16. Note that the declared return type is uint32_t even
 * though the operand is 16 bits wide.
 *
4934 * @returns The new value.
4935 * @param pu16 Pointer to the value to decrement.
4936 * @remarks Not implemented. Just to make 16-bit code happy.
4937 *
4938 * @remarks x86: Requires a 486 or later.
4939 */
4940RT_ASM_DECL_PRAGMA_WATCOM(uint32_t) ASMAtomicDecU16(uint16_t volatile RT_FAR *pu16) RT_NOTHROW_PROTO;
4941
4942
4943/**
4944 * Atomically decrement an unsigned 32-bit value, ordered.
4945 *
4946 * @returns The new value.
4947 * @param pu32 Pointer to the value to decrement.
4948 *
4949 * @remarks x86: Requires a 486 or later.
4950 */
4951#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
4952RT_ASM_DECL_PRAGMA_WATCOM(uint32_t) ASMAtomicDecU32(uint32_t volatile RT_FAR *pu32) RT_NOTHROW_PROTO;
4953#else
4954DECLINLINE(uint32_t) ASMAtomicDecU32(uint32_t volatile RT_FAR *pu32) RT_NOTHROW_DEF
4955{
4956# if RT_INLINE_ASM_USES_INTRIN
4957 return (uint32_t)_InterlockedDecrement((long RT_FAR *)pu32);
4958
4959# elif RT_INLINE_ASM_GNU_STYLE && defined(RT_ARCH_AMD64)
4960# if RT_INLINE_ASM_GNU_STYLE
4961 uint32_t u32;
4962 __asm__ __volatile__("lock; xaddl %0, %1\n\t"
4963 : "=r" (u32)
4964 , "=m" (*pu32)
4965 : "0" (-1)
4966 , "m" (*pu32)
4967 : "memory"
4968 , "cc");
4969 return u32-1;
4970# else
4971 uint32_t u32;
4972 __asm
4973 {
4974 mov eax, -1
4975# ifdef RT_ARCH_AMD64
4976 mov rdx, [pu32]
4977 lock xadd [rdx], eax
4978# else
4979 mov edx, [pu32]
4980 lock xadd [edx], eax
4981# endif
4982 mov u32, eax
4983 }
4984 return u32-1;
4985# endif
4986
4987# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
4988 /* M1 benchmark: ldaddal=6887 vs dmb+ldadd=2120 vs non-lse=6260 (ps/call) */
4989# if defined(RTASM_ARM64_USE_FEAT_LSE)
4990 uint32_t u32NewRet;
4991 __asm__ __volatile__("Lstart_ASMAtomicDecU32_%=:\n\t"
4992# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
4993 "ldaddal %w[uAddend], %w[uNewRet], %[pMem]\n\t"
4994# else
4995 RTASM_ARM_DMB_SY
4996 "ldadd %w[uAddend], %w[uNewRet], %[pMem]\n\t"
4997# endif
4998 "sub %w[uNewRet], %w[uNewRet], #1\n\t"
4999 : [pMem] "+Q" (*pu32)
5000 , [uNewRet] "=&r" (u32NewRet)
5001 : [uAddend] "r" (~(uint32_t)0)
5002 : );
5003# else
5004 RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_32(ASMAtomicDecU32, pu32, DMB_SY,
5005 "sub %w[uNew], %w[uNew], #1\n\t",
5006 "sub %[uNew], %[uNew], #1\n\t" /* arm6 / thumb2+ */,
5007 "X" (0) /* dummy */);
5008# endif
5009 return u32NewRet;
5010
5011# else
5012 return ASMAtomicSubU32(pu32, 1) - (uint32_t)1;
5013# endif
5014}
5015#endif
5016
5017
5018/**
5019 * Atomically decrement a signed 32-bit value, ordered.
5020 *
5021 * @returns The new value.
5022 * @param pi32 Pointer to the value to decrement.
5023 *
5024 * @remarks x86: Requires a 486 or later.
5025 */
5026DECLINLINE(int32_t) ASMAtomicDecS32(int32_t volatile RT_FAR *pi32) RT_NOTHROW_DEF
5027{
5028 return (int32_t)ASMAtomicDecU32((uint32_t volatile RT_FAR *)pi32);
5029}
5030
5031
5032/**
5033 * Atomically decrement an unsigned 64-bit value, ordered.
5034 *
 * The implementation is picked at compile time: the _InterlockedDecrement64
 * intrinsic (AMD64), GNU style AMD64 inline assembly, ARM inline assembly,
 * or - for anything else - ASMAtomicAddU64() with UINT64_MAX as addend.
 *
5035 * @returns The new value.
5036 * @param pu64 Pointer to the value to decrement.
5037 *
5038 * @remarks x86: Requires a Pentium or later.
5039 */
5040#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
5041RT_ASM_DECL_PRAGMA_WATCOM(uint64_t) ASMAtomicDecU64(uint64_t volatile RT_FAR *pu64) RT_NOTHROW_PROTO;
5042#else
5043DECLINLINE(uint64_t) ASMAtomicDecU64(uint64_t volatile RT_FAR *pu64) RT_NOTHROW_DEF
5044{
5045# if RT_INLINE_ASM_USES_INTRIN && defined(RT_ARCH_AMD64)
5046 return (uint64_t)_InterlockedDecrement64((__int64 volatile RT_FAR *)pu64);
5047
5048# elif RT_INLINE_ASM_GNU_STYLE && defined(RT_ARCH_AMD64)
5049 uint64_t u64;
 /* The addend is all ones, i.e. -1 in 64-bit two's complement. */
5050 __asm__ __volatile__("lock; xaddq %q0, %1\n\t"
5051 : "=r" (u64)
5052 , "=m" (*pu64)
5053 : "0" (~(uint64_t)0)
5054 , "m" (*pu64)
5055 : "memory"
5056 , "cc");
 /* The register holds the value from before the add, so adjust it to the
    new value promised by @returns. */
5057 return u64-1;
5058
5059# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
5060# if defined(RTASM_ARM64_USE_FEAT_LSE)
5061 uint64_t u64NewRet;
 /* Without RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB a RTASM_ARM_DMB_SY is
    emitted in front of a plain ldadd instead of using ldaddal. The
    trailing sub turns the loaded value into the new value. */
5062 __asm__ __volatile__("Lstart_ASMAtomicDecU64_%=:\n\t"
5063# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
5064 "ldaddal %[uAddend], %[uNewRet], %[pMem]\n\t"
5065# else
5066 RTASM_ARM_DMB_SY
5067 "ldadd %[uAddend], %[uNewRet], %[pMem]\n\t"
5068# endif
5069 "sub %[uNewRet], %[uNewRet], #1\n\t"
5070 : [pMem] "+Q" (*pu64)
5071 , [uNewRet] "=&r" (u64NewRet)
5072 : [uAddend] "r" (~(uint64_t)0)
5073 : );
5074# else
 /* u64NewRet is not declared in this branch; presumably the macro below
    declares it - confirm against the macro definition. The first code
    string is the 64-bit variant, the second (sub + sbc on the %H half)
    the 32-bit one. */
5075 RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_64(ASMAtomicDecU64, pu64, DMB_SY,
5076 "sub %[uNew], %[uNew], #1\n\t"
5077 ,
5078 "sub %[uNew], %[uNew], #1\n\t" /* arm6 / thumb2+ */
5079 "sbc %H[uNew], %H[uNew], %[uZeroVal]\n\t",
5080 RTASM_ARM_PICK_6432("X" (0) /* dummy */, [uZeroVal] "r" (0)) );
5081# endif
5082 return u64NewRet;
5083
5084# else
 /* Adding UINT64_MAX equals subtracting one modulo 2^64; the - 1 assumes
    ASMAtomicAddU64 returns the old value - confirm against its definition. */
5085 return ASMAtomicAddU64(pu64, UINT64_MAX) - 1;
5086# endif
5087}
5088#endif
5089
5090
5091/**
5092 * Atomically decrement a signed 64-bit value, ordered.
5093 *
5094 * @returns The new value.
5095 * @param pi64 Pointer to the value to decrement.
5096 *
5097 * @remarks x86: Requires a Pentium or later.
5098 */
5099DECLINLINE(int64_t) ASMAtomicDecS64(int64_t volatile RT_FAR *pi64) RT_NOTHROW_DEF
5100{
5101 return (int64_t)ASMAtomicDecU64((uint64_t volatile RT_FAR *)pi64);
5102}
5103
5104
5105/**
5106 * Atomically decrement a size_t value, ordered.
5107 *
5108 * @returns The new value.
5109 * @param pcb Pointer to the value to decrement.
5110 *
5111 * @remarks x86: Requires a 486 or later.
5112 */
5113DECLINLINE(size_t) ASMAtomicDecZ(size_t volatile RT_FAR *pcb) RT_NOTHROW_DEF
5114{
5115#if ARCH_BITS == 64
5116 return ASMAtomicDecU64((uint64_t volatile RT_FAR *)pcb);
5117#elif ARCH_BITS == 32
5118 return ASMAtomicDecU32((uint32_t volatile RT_FAR *)pcb);
5119#elif ARCH_BITS == 16
5120 return ASMAtomicDecU16((uint16_t volatile RT_FAR *)pcb);
5121#else
5122# error "Unsupported ARCH_BITS value"
5123#endif
5124}
5125
5126
5127/**
5128 * Atomically OR an unsigned 32-bit value into a memory location, ordered.
5129 *
 * The implementation is picked at compile time: the _InterlockedOr
 * intrinsic, x86 / AMD64 inline assembly (lock or), or ARM inline assembly.
 * Any other architecture fails the build with "Port me".
 *
5130 * @param pu32 Pointer to the variable to OR @a u32 with.
5131 * @param u32 The value to OR @a *pu32 with.
5132 *
5133 * @remarks x86: Requires a 386 or later.
5134 */
5135#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
5136RT_ASM_DECL_PRAGMA_WATCOM(void) ASMAtomicOrU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_PROTO;
5137#else
5138DECLINLINE(void) ASMAtomicOrU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_DEF
5139{
5140# if RT_INLINE_ASM_USES_INTRIN
5141 _InterlockedOr((long volatile RT_FAR *)pu32, (long)u32);
5142
5143# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
5144# if RT_INLINE_ASM_GNU_STYLE
5145 __asm__ __volatile__("lock; orl %1, %0\n\t"
5146 : "=m" (*pu32)
5147 : "ir" (u32)
5148 , "m" (*pu32)
5149 : "cc");
5150# else
5151 __asm
5152 {
5153 mov eax, [u32]
5154# ifdef RT_ARCH_AMD64
5155 mov rdx, [pu32]
5156 lock or [rdx], eax
5157# else
5158 mov edx, [pu32]
5159 lock or [edx], eax
5160# endif
5161 }
5162# endif
5163
5164# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
5165# if defined(RTASM_ARM64_USE_FEAT_LSE)
5166# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
 /* ldsetal takes a destination register; what lands in u32Spill is never
    read afterwards. */
5167 uint32_t u32Spill;
5168 __asm__ __volatile__("Lstart_ASMAtomicOrU32_%=:\n\t"
5169 "ldsetal %w[fBitsToSet], %w[uSpill], %[pMem]\n\t"
5170 : [pMem] "+Q" (*pu32)
5171 , [uSpill] "=&r" (u32Spill)
5172 : [fBitsToSet] "r" (u32)
5173 : );
5174# else
 /* RTASM_ARM_DMB_SY followed by the store-only stset form; no result
    register is needed here. */
5175 __asm__ __volatile__("Lstart_ASMAtomicOrU32_%=:\n\t"
5176 RTASM_ARM_DMB_SY
5177 "stset %w[fBitsToSet], %[pMem]\n\t"
5178 : [pMem] "+Q" (*pu32)
5179 : [fBitsToSet] "r" (u32)
5180 : );
5181# endif
5182# else
5183 /* For more on Orr see https://en.wikipedia.org/wiki/Orr_(Catch-22) ;-) */
 /* First code string uses %w operands (64-bit ARM), the second is the
    32-bit ARM spelling of the same ORR. */
5184 RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_32(ASMAtomicOr32, pu32, DMB_SY,
5185 "orr %w[uNew], %w[uNew], %w[uVal]\n\t",
5186 "orr %[uNew], %[uNew], %[uVal]\n\t",
5187 [uVal] "r" (u32));
5188
5189# endif
5190# else
5191# error "Port me"
5192# endif
5193}
5194#endif
5195
5196
5197/**
5198 * Atomically OR an unsigned 32-bit value, ordered, extended version (for bitmap
5199 * fallback).
5200 *
 * On ARM this is inline assembly (LSE ldset / ldsetal, or the generic
 * load-modify-store macro); everywhere else it is a compare-exchange retry
 * loop built on ASMAtomicCmpXchgExU32().
 *
5201 * @returns Old value.
5202 * @param pu32 Pointer to the variable to OR @a u32 with.
5203 * @param u32 The value to OR @a *pu32 with.
5204 */
5205DECLINLINE(uint32_t) ASMAtomicOrExU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_DEF
5206{
5207#if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
5208# if defined(RTASM_ARM64_USE_FEAT_LSE)
5209 uint32_t u32OldRet;
5210 __asm__ __volatile__("Lstart_ASMAtomicOrExU32_%=:\n\t"
5211# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
5212 "ldsetal %w[fBitsToSet], %w[uOldRet], %[pMem]\n\t"
5213# else
5214 RTASM_ARM_DMB_SY
5215 "ldset %w[fBitsToSet], %w[uOldRet], %[pMem]\n\t"
5216# endif
5217 : [pMem] "+Q" (*pu32)
5218 , [uOldRet] "=&r" (u32OldRet)
5219 : [fBitsToSet] "r" (u32)
5220 : );
5221# else
 /* u32OldRet is not declared in this branch; presumably the macro below
    declares it - confirm against the macro definition. */
5222 RTASM_ARM_LOAD_MODIFY_STORE_RET_OLD_32(ASMAtomicOrEx32, pu32, DMB_SY,
5223 "orr %w[uNew], %w[uOld], %w[uVal]\n\t",
5224 "orr %[uNew], %[uOld], %[uVal]\n\t",
5225 [uVal] "r" (u32));
5226# endif
5227 return u32OldRet;
5228
5229#else
 /* Start from an unordered read, then retry until the compare-exchange
    succeeds. &u32RetOld is passed as the last argument, so a failed attempt
    presumably refreshes it with the current value - confirm against
    ASMAtomicCmpXchgExU32. */
5230 uint32_t u32RetOld = ASMAtomicUoReadU32(pu32);
5231 uint32_t u32New;
5232 do
5233 u32New = u32RetOld | u32;
5234 while (!ASMAtomicCmpXchgExU32(pu32, u32New, u32RetOld, &u32RetOld));
5235 return u32RetOld;
5236#endif
5237}
5238
5239
5240/**
5241 * Atomically Or a signed 32-bit value, ordered.
5242 *
5243 * @param pi32 Pointer to the pointer variable to OR u32 with.
5244 * @param i32 The value to OR *pu32 with.
5245 *
5246 * @remarks x86: Requires a 386 or later.
5247 */
5248DECLINLINE(void) ASMAtomicOrS32(int32_t volatile RT_FAR *pi32, int32_t i32) RT_NOTHROW_DEF
5249{
5250 ASMAtomicOrU32((uint32_t volatile RT_FAR *)pi32, (uint32_t)i32);
5251}
5252
5253
5254/**
5255 * Atomically OR an unsigned 64-bit value into a memory location, ordered.
5256 *
 * The implementation is picked at compile time: the _InterlockedOr64
 * intrinsic (AMD64), GNU style AMD64 inline assembly, ARM inline assembly,
 * or - for anything else - a compare-exchange retry loop.
 *
5257 * @param pu64 Pointer to the variable to OR @a u64 with.
5258 * @param u64 The value to OR @a *pu64 with.
5259 *
5260 * @remarks x86: Requires a Pentium or later.
5261 */
5262#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
5263DECLASM(void) ASMAtomicOrU64(uint64_t volatile RT_FAR *pu64, uint64_t u64) RT_NOTHROW_PROTO;
5264#else
5265DECLINLINE(void) ASMAtomicOrU64(uint64_t volatile RT_FAR *pu64, uint64_t u64) RT_NOTHROW_DEF
5266{
5267# if RT_INLINE_ASM_USES_INTRIN && defined(RT_ARCH_AMD64)
5268 _InterlockedOr64((__int64 volatile RT_FAR *)pu64, (__int64)u64);
5269
5270# elif RT_INLINE_ASM_GNU_STYLE && defined(RT_ARCH_AMD64)
5271 __asm__ __volatile__("lock; orq %1, %q0\n\t"
5272 : "=m" (*pu64)
5273 : "r" (u64)
5274 , "m" (*pu64)
5275 : "cc");
5276
5277# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
5278# if defined(RTASM_ARM64_USE_FEAT_LSE)
5279# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
 /* ldsetal takes a destination register; what lands in u64Spill is never
    read afterwards. */
5280 uint64_t u64Spill;
5281 __asm__ __volatile__("Lstart_ASMAtomicOrU64_%=:\n\t"
5282 "ldsetal %[fBitsToSet], %[uSpill], %[pMem]\n\t"
5283 : [pMem] "+Q" (*pu64)
5284 , [uSpill] "=&r" (u64Spill)
5285 : [fBitsToSet] "r" (u64)
5286 : );
5287# else
5288 __asm__ __volatile__("Lstart_ASMAtomicOrU64_%=:\n\t"
5289 RTASM_ARM_DMB_SY
5290 "stset %[fBitsToSet], %[pMem]\n\t"
5291 : [pMem] "+Q" (*pu64)
5292 : [fBitsToSet] "r" (u64)
5293 : );
5294# endif
5295# else
 /* First code string is the 64-bit variant; the second ORs the two
    halves separately (%H selects the other half of the register pair). */
5296 RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_64(ASMAtomicOrU64, pu64, DMB_SY,
5297 "orr %[uNew], %[uNew], %[uVal]\n\t"
5298 ,
5299 "orr %[uNew], %[uNew], %[uVal]\n\t"
5300 "orr %H[uNew], %H[uNew], %H[uVal]\n\t",
5301 [uVal] "r" (u64));
5302# endif
5303
5304# else
 /* Generic fallback: re-read, compute and retry until the compare-exchange
    succeeds, pausing between attempts. */
5305 for (;;)
5306 {
5307 uint64_t u64Old = ASMAtomicUoReadU64(pu64);
5308 uint64_t u64New = u64Old | u64;
5309 if (ASMAtomicCmpXchgU64(pu64, u64New, u64Old))
5310 break;
5311 ASMNopPause();
5312 }
5313# endif
5314}
5315#endif
5316
5317
5318/**
5319 * Atomically Or a signed 64-bit value, ordered.
5320 *
5321 * @param pi64 Pointer to the pointer variable to OR u64 with.
5322 * @param i64 The value to OR *pu64 with.
5323 *
5324 * @remarks x86: Requires a Pentium or later.
5325 */
5326DECLINLINE(void) ASMAtomicOrS64(int64_t volatile RT_FAR *pi64, int64_t i64) RT_NOTHROW_DEF
5327{
5328 ASMAtomicOrU64((uint64_t volatile RT_FAR *)pi64, (uint64_t)i64);
5329}
5330
5331
5332/**
5333 * Atomically AND an unsigned 32-bit value into a memory location, ordered.
5334 *
 * The implementation is picked at compile time: the _InterlockedAnd
 * intrinsic, x86 / AMD64 inline assembly (lock and), or ARM inline assembly.
 * Any other architecture fails the build with "Port me".
 *
5335 * @param pu32 Pointer to the variable to AND @a u32 with.
5336 * @param u32 The value to AND @a *pu32 with.
5337 *
5338 * @remarks x86: Requires a 386 or later.
5339 */
5340#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
5341RT_ASM_DECL_PRAGMA_WATCOM(void) ASMAtomicAndU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_PROTO;
5342#else
5343DECLINLINE(void) ASMAtomicAndU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_DEF
5344{
5345# if RT_INLINE_ASM_USES_INTRIN
5346 _InterlockedAnd((long volatile RT_FAR *)pu32, u32);
5347
5348# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
5349# if RT_INLINE_ASM_GNU_STYLE
5350 __asm__ __volatile__("lock; andl %1, %0\n\t"
5351 : "=m" (*pu32)
5352 : "ir" (u32)
5353 , "m" (*pu32)
5354 : "cc");
5355# else
5356 __asm
5357 {
5358 mov eax, [u32]
5359# ifdef RT_ARCH_AMD64
5360 mov rdx, [pu32]
5361 lock and [rdx], eax
5362# else
5363 mov edx, [pu32]
5364 lock and [edx], eax
5365# endif
5366 }
5367# endif
5368
5369# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
5370# if defined(RTASM_ARM64_USE_FEAT_LSE)
 /* The LSE instructions used here clear bits (operand fBitsToClear), so
    the AND mask is passed inverted (~u32). */
5371# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
 /* What lands in u32Spill is never read afterwards. */
5372 uint32_t u32Spill;
5373 __asm__ __volatile__("Lstart_ASMAtomicAndU32_%=:\n\t"
5374 "ldclral %w[fBitsToClear], %w[uSpill], %[pMem]\n\t"
5375 : [pMem] "+Q" (*pu32)
5376 , [uSpill] "=&r" (u32Spill)
5377 : [fBitsToClear] "r" (~u32)
5378 : );
5379# else
5380 __asm__ __volatile__("Lstart_ASMAtomicAndU32_%=:\n\t"
5381 RTASM_ARM_DMB_SY
5382 "stclr %w[fBitsToClear], %[pMem]\n\t"
5383 : [pMem] "+Q" (*pu32)
5384 : [fBitsToClear] "r" (~u32)
5385 : );
5386# endif
5387# else
 /* The generic path uses a plain AND, so u32 is passed as-is. */
5388 RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_32(ASMAtomicAnd32, pu32, DMB_SY,
5389 "and %w[uNew], %w[uNew], %w[uVal]\n\t",
5390 "and %[uNew], %[uNew], %[uVal]\n\t",
5391 [uVal] "r" (u32));
5392
5393# endif
5394# else
5395# error "Port me"
5396# endif
5397}
5398#endif
5399
5400
5401/**
5402 * Atomically AND an unsigned 32-bit value, ordered, extended version.
5403 *
 * On ARM this is inline assembly (LSE ldclr / ldclral, or the generic
 * load-modify-store macro); everywhere else it is a compare-exchange retry
 * loop built on ASMAtomicCmpXchgExU32().
 *
5404 * @returns Old value.
5405 * @param pu32 Pointer to the variable to AND @a u32 with.
5406 * @param u32 The value to AND @a *pu32 with.
5407 */
5408DECLINLINE(uint32_t) ASMAtomicAndExU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_DEF
5409{
5410#if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
5411# if defined(RTASM_ARM64_USE_FEAT_LSE)
5412 uint32_t u32OldRet;
 /* The LSE instructions used here clear bits (operand fBitsToClear), so
    the AND mask is passed inverted (~u32). */
5413 __asm__ __volatile__("Lstart_ASMAtomicAndExU32_%=:\n\t"
5414# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
5415 "ldclral %w[fBitsToClear], %w[uOldRet], %[pMem]\n\t"
5416# else
5417 RTASM_ARM_DMB_SY
5418 "ldclr %w[fBitsToClear], %w[uOldRet], %[pMem]\n\t"
5419# endif
5420 : [pMem] "+Q" (*pu32)
5421 , [uOldRet] "=&r" (u32OldRet)
5422 : [fBitsToClear] "r" (~u32)
5423 : );
5424# else
 /* u32OldRet is not declared in this branch; presumably the macro below
    declares it - confirm against the macro definition. */
5425 RTASM_ARM_LOAD_MODIFY_STORE_RET_OLD_32(ASMAtomicAndEx32, pu32, DMB_SY,
5426 "and %w[uNew], %w[uOld], %w[uVal]\n\t",
5427 "and %[uNew], %[uOld], %[uVal]\n\t",
5428 [uVal] "r" (u32));
5429# endif
5430 return u32OldRet;
5431
5432#else
 /* Start from an unordered read, then retry until the compare-exchange
    succeeds. &u32RetOld is passed as the last argument, so a failed attempt
    presumably refreshes it with the current value - confirm against
    ASMAtomicCmpXchgExU32. */
5433 uint32_t u32RetOld = ASMAtomicUoReadU32(pu32);
5434 uint32_t u32New;
5435 do
5436 u32New = u32RetOld & u32;
5437 while (!ASMAtomicCmpXchgExU32(pu32, u32New, u32RetOld, &u32RetOld));
5438 return u32RetOld;
5439#endif
5440}
5441
5442
5443/**
5444 * Atomically And a signed 32-bit value, ordered.
5445 *
5446 * @param pi32 Pointer to the pointer variable to AND i32 with.
5447 * @param i32 The value to AND *pi32 with.
5448 *
5449 * @remarks x86: Requires a 386 or later.
5450 */
5451DECLINLINE(void) ASMAtomicAndS32(int32_t volatile RT_FAR *pi32, int32_t i32) RT_NOTHROW_DEF
5452{
5453 ASMAtomicAndU32((uint32_t volatile RT_FAR *)pi32, (uint32_t)i32);
5454}
5455
5456
5457/**
5458 * Atomically AND an unsigned 64-bit value into a memory location, ordered.
5459 *
 * The implementation is picked at compile time: the _InterlockedAnd64
 * intrinsic (AMD64), GNU style AMD64 inline assembly, ARM inline assembly,
 * or - for anything else - a compare-exchange retry loop.
 *
5460 * @param pu64 Pointer to the variable to AND @a u64 with.
5461 * @param u64 The value to AND @a *pu64 with.
5462 *
5463 * @remarks x86: Requires a Pentium or later.
5464 */
5465#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
5466DECLASM(void) ASMAtomicAndU64(uint64_t volatile RT_FAR *pu64, uint64_t u64) RT_NOTHROW_PROTO;
5467#else
5468DECLINLINE(void) ASMAtomicAndU64(uint64_t volatile RT_FAR *pu64, uint64_t u64) RT_NOTHROW_DEF
5469{
5470# if RT_INLINE_ASM_USES_INTRIN && defined(RT_ARCH_AMD64)
5471 _InterlockedAnd64((__int64 volatile RT_FAR *)pu64, u64);
5472
5473# elif RT_INLINE_ASM_GNU_STYLE && defined(RT_ARCH_AMD64)
5474 __asm__ __volatile__("lock; andq %1, %0\n\t"
5475 : "=m" (*pu64)
5476 : "r" (u64)
5477 , "m" (*pu64)
5478 : "cc");
5479
5480# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
5481# if defined(RTASM_ARM64_USE_FEAT_LSE)
 /* The LSE instructions used here clear bits (operand fBitsToClear), so
    the AND mask is passed inverted (~u64). */
5482# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
 /* What lands in u64Spill is never read afterwards. */
5483 uint64_t u64Spill;
5484 __asm__ __volatile__("Lstart_ASMAtomicAndU64_%=:\n\t"
5485 "ldclral %[fBitsToClear], %[uSpill], %[pMem]\n\t"
5486 : [pMem] "+Q" (*pu64)
5487 , [uSpill] "=&r" (u64Spill)
5488 : [fBitsToClear] "r" (~u64)
5489 : );
5490# else
5491 __asm__ __volatile__("Lstart_ASMAtomicAndU64_%=:\n\t"
5492 RTASM_ARM_DMB_SY
5493 "stclr %[fBitsToClear], %[pMem]\n\t"
5494 : [pMem] "+Q" (*pu64)
5495 : [fBitsToClear] "r" (~u64)
5496 : );
5497# endif
5498# else
 /* First code string is the 64-bit variant; the second ANDs the two
    halves separately (%H selects the other half of the register pair). */
5499 RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_64(ASMAtomicAndU64, pu64, DMB_SY,
5500 "and %[uNew], %[uNew], %[uVal]\n\t"
5501 ,
5502 "and %[uNew], %[uNew], %[uVal]\n\t"
5503 "and %H[uNew], %H[uNew], %H[uVal]\n\t",
5504 [uVal] "r" (u64));
5505# endif
5506
5507# else
 /* Generic fallback: re-read, compute and retry until the compare-exchange
    succeeds, pausing between attempts. */
5508 for (;;)
5509 {
5510 uint64_t u64Old = ASMAtomicUoReadU64(pu64);
5511 uint64_t u64New = u64Old & u64;
5512 if (ASMAtomicCmpXchgU64(pu64, u64New, u64Old))
5513 break;
5514 ASMNopPause();
5515 }
5516# endif
5517}
5518#endif
5519
5520
5521/**
5522 * Atomically And a signed 64-bit value, ordered.
5523 *
5524 * @param pi64 Pointer to the pointer variable to AND i64 with.
5525 * @param i64 The value to AND *pi64 with.
5526 *
5527 * @remarks x86: Requires a Pentium or later.
5528 */
5529DECLINLINE(void) ASMAtomicAndS64(int64_t volatile RT_FAR *pi64, int64_t i64) RT_NOTHROW_DEF
5530{
5531 ASMAtomicAndU64((uint64_t volatile RT_FAR *)pi64, (uint64_t)i64);
5532}
5533
5534
5535/**
5536 * Atomically XOR an unsigned 32-bit value and a memory location, ordered.
5537 *
 * The implementation is picked at compile time: the _InterlockedXor
 * intrinsic, x86 / AMD64 inline assembly (lock xor), or ARM inline assembly.
 * Any other architecture fails the build with "Port me".
 *
5538 * @param pu32 Pointer to the variable to XOR @a u32 with.
5539 * @param u32 The value to XOR @a *pu32 with.
5540 *
5541 * @remarks x86: Requires a 386 or later.
5542 */
5543#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
5544RT_ASM_DECL_PRAGMA_WATCOM(void) ASMAtomicXorU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_PROTO;
5545#else
5546DECLINLINE(void) ASMAtomicXorU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_DEF
5547{
5548# if RT_INLINE_ASM_USES_INTRIN
5549 _InterlockedXor((long volatile RT_FAR *)pu32, u32);
5550
5551# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
5552# if RT_INLINE_ASM_GNU_STYLE
5553 __asm__ __volatile__("lock; xorl %1, %0\n\t"
5554 : "=m" (*pu32)
5555 : "ir" (u32)
5556 , "m" (*pu32)
5557 : "cc");
5558# else
5559 __asm
5560 {
5561 mov eax, [u32]
5562# ifdef RT_ARCH_AMD64
5563 mov rdx, [pu32]
5564 lock xor [rdx], eax
5565# else
5566 mov edx, [pu32]
5567 lock xor [edx], eax
5568# endif
5569 }
5570# endif
5571
5572# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
5573# if defined(RTASM_ARM64_USE_FEAT_LSE)
5574# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
 /* ldeoral takes a destination register; what lands in u32Spill is never
    read afterwards. */
5575 uint32_t u32Spill;
5576 __asm__ __volatile__("Lstart_ASMAtomicXorU32_%=:\n\t"
5577 "ldeoral %w[fBitMask], %w[uSpill], %[pMem]\n\t"
5578 : [pMem] "+Q" (*pu32)
5579 , [uSpill] "=&r" (u32Spill)
5580 : [fBitMask] "r" (u32)
5581 : );
5582# else
 /* RTASM_ARM_DMB_SY followed by the store-only steor form; no result
    register is needed here. */
5583 __asm__ __volatile__("Lstart_ASMAtomicXorU32_%=:\n\t"
5584 RTASM_ARM_DMB_SY
5585 "steor %w[fBitMask], %[pMem]\n\t"
5586 : [pMem] "+Q" (*pu32)
5587 : [fBitMask] "r" (u32)
5588 : );
5589# endif
5590# else
5591 RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_32(ASMAtomicXor32, pu32, DMB_SY,
5592 "eor %w[uNew], %w[uNew], %w[uVal]\n\t",
5593 "eor %[uNew], %[uNew], %[uVal]\n\t",
5594 [uVal] "r" (u32));
5595# endif
5596
5597# else
5598# error "Port me"
5599# endif
5600}
5601#endif
5602
5603
5604/**
5605 * Atomically XOR an unsigned 32-bit value and a memory location, ordered,
5606 * extended version (for bitmaps).
5607 *
 * On ARM this is inline assembly (LSE ldeor / ldeoral, or the generic
 * load-modify-store macro); everywhere else it is a compare-exchange retry
 * loop built on ASMAtomicCmpXchgExU32().
 *
5608 * @returns Old value.
5609 * @param pu32 Pointer to the variable to XOR @a u32 with.
5610 * @param u32 The value to XOR @a *pu32 with.
5611 */
5612DECLINLINE(uint32_t) ASMAtomicXorExU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_DEF
5613{
5614#if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
5615# if defined(RTASM_ARM64_USE_FEAT_LSE)
5616 uint32_t u32OldRet;
5617 __asm__ __volatile__("Lstart_ASMAtomicXorExU32_%=:\n\t"
5618# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
5619 "ldeoral %w[fBitMask], %w[uOldRet], %[pMem]\n\t"
5620# else
5621 RTASM_ARM_DMB_SY
5622 "ldeor %w[fBitMask], %w[uOldRet], %[pMem]\n\t"
5623# endif
5624 : [pMem] "+Q" (*pu32)
5625 , [uOldRet] "=&r" (u32OldRet)
5626 : [fBitMask] "r" (u32)
5627 : );
5628# else
 /* u32OldRet is not declared in this branch; presumably the macro below
    declares it - confirm against the macro definition. */
5629 RTASM_ARM_LOAD_MODIFY_STORE_RET_OLD_32(ASMAtomicXorEx32, pu32, DMB_SY,
5630 "eor %w[uNew], %w[uOld], %w[uVal]\n\t",
5631 "eor %[uNew], %[uOld], %[uVal]\n\t",
5632 [uVal] "r" (u32));
5633# endif
5634 return u32OldRet;
5635
5636#else
 /* Start from an unordered read, then retry until the compare-exchange
    succeeds. &u32RetOld is passed as the last argument, so a failed attempt
    presumably refreshes it with the current value - confirm against
    ASMAtomicCmpXchgExU32. */
5637 uint32_t u32RetOld = ASMAtomicUoReadU32(pu32);
5638 uint32_t u32New;
5639 do
5640 u32New = u32RetOld ^ u32;
5641 while (!ASMAtomicCmpXchgExU32(pu32, u32New, u32RetOld, &u32RetOld));
5642 return u32RetOld;
5643#endif
5644}
5645
5646
5647/**
5648 * Atomically XOR a signed 32-bit value, ordered.
5649 *
5650 * @param pi32 Pointer to the variable to XOR i32 with.
5651 * @param i32 The value to XOR *pi32 with.
5652 *
5653 * @remarks x86: Requires a 386 or later.
5654 */
5655DECLINLINE(void) ASMAtomicXorS32(int32_t volatile RT_FAR *pi32, int32_t i32) RT_NOTHROW_DEF
5656{
5657 ASMAtomicXorU32((uint32_t volatile RT_FAR *)pi32, (uint32_t)i32);
5658}
5659
5660
5661/**
5662 * Atomically OR an unsigned 32-bit value, unordered but interrupt safe.
5663 *
 * Same operation as ASMAtomicOrU32(), but the x86 code carries no lock
 * prefix and the ARM code uses neither RTASM_ARM_DMB_SY nor the "al"
 * instruction forms. There is no intrinsic based variant.
 *
5664 * @param pu32 Pointer to the variable to OR @a u32 with.
5665 * @param u32 The value to OR @a *pu32 with.
5666 *
5667 * @remarks x86: Requires a 386 or later.
5668 */
5669#if RT_INLINE_ASM_EXTERNAL_TMP_ARM
5670RT_ASM_DECL_PRAGMA_WATCOM(void) ASMAtomicUoOrU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_PROTO;
5671#else
5672DECLINLINE(void) ASMAtomicUoOrU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_DEF
5673{
5674# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
5675# if RT_INLINE_ASM_GNU_STYLE
5676 __asm__ __volatile__("orl %1, %0\n\t"
5677 : "=m" (*pu32)
5678 : "ir" (u32)
5679 , "m" (*pu32)
5680 : "cc");
5681# else
5682 __asm
5683 {
5684 mov eax, [u32]
5685# ifdef RT_ARCH_AMD64
5686 mov rdx, [pu32]
5687 or [rdx], eax
5688# else
5689 mov edx, [pu32]
5690 or [edx], eax
5691# endif
5692 }
5693# endif
5694
5695# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
5696 /* M1 benchmark: stset=1974 vs non-lse=6271 */
5697# if defined(RTASM_ARM64_USE_FEAT_LSE)
5698 __asm__ __volatile__("Lstart_ASMAtomicUoOrU32_%=:\n\t"
5699 "stset %w[fBitsToSet], %[pMem]\n\t"
5700 : [pMem] "+Q" (*pu32)
5701 : [fBitsToSet] "r" (u32)
5702 : );
5703# else
 /* Same macro as the ordered variant, but with NO_BARRIER instead of
    DMB_SY. */
5704 RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_32(ASMAtomicUoOrU32, pu32, NO_BARRIER,
5705 "orr %w[uNew], %w[uNew], %w[uVal]\n\t",
5706 "orr %[uNew], %[uNew], %[uVal]\n\t",
5707 [uVal] "r" (u32));
5708# endif
5709
5710# else
5711# error "Port me"
5712# endif
5713}
5714#endif
5715
5716
5717/**
5718 * Atomically OR an unsigned 32-bit value, unordered but interrupt safe,
5719 * extended version (for bitmap fallback).
5720 *
 * Only ARM has a genuinely unordered implementation; all other targets
 * forward to the ordered ASMAtomicOrExU32().
 *
5721 * @returns Old value.
5722 * @param pu32 Pointer to the variable to OR @a u32 with.
5723 * @param u32 The value to OR @a *pu32 with.
5724 */
5725DECLINLINE(uint32_t) ASMAtomicUoOrExU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_DEF
5726{
5727#if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
5728# if defined(RTASM_ARM64_USE_FEAT_LSE)
5729 uint32_t u32OldRet;
 /* NOTE(review): the label below is spelled ASMAtomicOrExU32, not
    ASMAtomicUoOrExU32 as in the non-LSE branch; looks like a copy & paste
    leftover. */
5730 __asm__ __volatile__("Lstart_ASMAtomicOrExU32_%=:\n\t"
5731 "ldset %w[fBitsToSet], %w[uOldRet], %[pMem]\n\t"
5732 : [pMem] "+Q" (*pu32)
5733 , [uOldRet] "=&r" (u32OldRet)
5734 : [fBitsToSet] "r" (u32)
5735 : );
5736# else
 /* u32OldRet is not declared in this branch; presumably the macro below
    declares it - confirm against the macro definition. */
5737 RTASM_ARM_LOAD_MODIFY_STORE_RET_OLD_32(ASMAtomicUoOrExU32, pu32, NO_BARRIER,
5738 "orr %w[uNew], %w[uOld], %w[uVal]\n\t",
5739 "orr %[uNew], %[uOld], %[uVal]\n\t",
5740 [uVal] "r" (u32));
5741# endif
5742 return u32OldRet;
5743
5744#else
5745 return ASMAtomicOrExU32(pu32, u32); /* (we have no unordered cmpxchg primitive atm.) */
5746#endif
5747}
5748
5749
5750/**
5751 * Atomically OR a signed 32-bit value, unordered.
5752 *
5753 * @param pi32 Pointer to the pointer variable to OR u32 with.
5754 * @param i32 The value to OR *pu32 with.
5755 *
5756 * @remarks x86: Requires a 386 or later.
5757 */
5758DECLINLINE(void) ASMAtomicUoOrS32(int32_t volatile RT_FAR *pi32, int32_t i32) RT_NOTHROW_DEF
5759{
5760 ASMAtomicUoOrU32((uint32_t volatile RT_FAR *)pi32, (uint32_t)i32);
5761}
5762
5763
5764/**
5765 * Atomically OR an unsigned 64-bit value, unordered.
5766 *
 * The implementation is picked at compile time: GNU style AMD64 inline
 * assembly (or without lock prefix), ARM inline assembly without barrier,
 * or - for anything else - a compare-exchange retry loop.
 *
5767 * @param pu64 Pointer to the variable to OR @a u64 with.
5768 * @param u64 The value to OR @a *pu64 with.
5769 *
5770 * @remarks x86: Requires a Pentium or later.
5771 */
5772#if RT_INLINE_ASM_EXTERNAL_TMP_ARM
5773DECLASM(void) ASMAtomicUoOrU64(uint64_t volatile RT_FAR *pu64, uint64_t u64) RT_NOTHROW_PROTO;
5774#else
5775DECLINLINE(void) ASMAtomicUoOrU64(uint64_t volatile RT_FAR *pu64, uint64_t u64) RT_NOTHROW_DEF
5776{
5777# if RT_INLINE_ASM_GNU_STYLE && defined(RT_ARCH_AMD64)
5778 __asm__ __volatile__("orq %1, %q0\n\t"
5779 : "=m" (*pu64)
5780 : "r" (u64)
5781 , "m" (*pu64)
5782 : "cc");
5783
5784# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
5785# if defined(RTASM_ARM64_USE_FEAT_LSE)
5786 __asm__ __volatile__("Lstart_ASMAtomicUoOrU64_%=:\n\t"
5787 "stset %[fBitsToSet], %[pMem]\n\t"
5788 : [pMem] "+Q" (*pu64)
5789 : [fBitsToSet] "r" (u64)
5790 : );
5791# else
 /* First code string is the 64-bit variant; the second ORs the two
    halves separately (%H selects the other half of the register pair). */
5792 RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_64(ASMAtomicUoOrU64, pu64, NO_BARRIER,
5793 "orr %[uNew], %[uNew], %[uVal]\n\t"
5794 ,
5795 "orr %[uNew], %[uNew], %[uVal]\n\t"
5796 "orr %H[uNew], %H[uNew], %H[uVal]\n\t",
5797 [uVal] "r" (u64));
5798# endif
5799
5800# else
 /* Generic fallback: same compare-exchange retry loop as the ordered
    ASMAtomicOrU64() uses. */
5801 for (;;)
5802 {
5803 uint64_t u64Old = ASMAtomicUoReadU64(pu64);
5804 uint64_t u64New = u64Old | u64;
5805 if (ASMAtomicCmpXchgU64(pu64, u64New, u64Old))
5806 break;
5807 ASMNopPause();
5808 }
5809# endif
5810}
5811#endif
5812
5813
5814/**
5815 * Atomically Or a signed 64-bit value, unordered.
5816 *
5817 * @param pi64 Pointer to the pointer variable to OR u64 with.
5818 * @param i64 The value to OR *pu64 with.
5819 *
5820 * @remarks x86: Requires a Pentium or later.
5821 */
5822DECLINLINE(void) ASMAtomicUoOrS64(int64_t volatile RT_FAR *pi64, int64_t i64) RT_NOTHROW_DEF
5823{
5824 ASMAtomicUoOrU64((uint64_t volatile RT_FAR *)pi64, (uint64_t)i64);
5825}
5826
5827
5828/**
5829 * Atomically AND an unsigned 32-bit value, unordered.
5830 *
 * Same operation as ASMAtomicAndU32(), but the x86 code carries no lock
 * prefix and the ARM code uses neither RTASM_ARM_DMB_SY nor the "al"
 * instruction forms. There is no intrinsic based variant.
 *
5831 * @param pu32 Pointer to the variable to AND @a u32 with.
5832 * @param u32 The value to AND @a *pu32 with.
5833 *
5834 * @remarks x86: Requires a 386 or later.
5835 */
5836#if RT_INLINE_ASM_EXTERNAL_TMP_ARM
5837RT_ASM_DECL_PRAGMA_WATCOM(void) ASMAtomicUoAndU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_PROTO;
5838#else
5839DECLINLINE(void) ASMAtomicUoAndU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_DEF
5840{
5841# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
5842# if RT_INLINE_ASM_GNU_STYLE
5843 __asm__ __volatile__("andl %1, %0\n\t"
5844 : "=m" (*pu32)
5845 : "ir" (u32)
5846 , "m" (*pu32)
5847 : "cc");
5848# else
5849 __asm
5850 {
5851 mov eax, [u32]
5852# ifdef RT_ARCH_AMD64
5853 mov rdx, [pu32]
5854 and [rdx], eax
5855# else
5856 mov edx, [pu32]
5857 and [edx], eax
5858# endif
5859 }
5860# endif
5861
5862# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
5863 /* M1 benchmark: stclr=1884 vs non-lse=6299 (ps/call) */
5864# if defined(RTASM_ARM64_USE_FEAT_LSE)
 /* stclr clears bits (operand fBitsToClear), so the AND mask is passed
    inverted (~u32). */
5865 __asm__ __volatile__("Lstart_ASMAtomicUoAndU32_%=:\n\t"
5866 "stclr %w[fBitsToClear], %[pMem]\n\t"
5867 : [pMem] "+Q" (*pu32)
5868 : [fBitsToClear] "r" (~u32)
5869 : );
5870# else
 /* Same macro as the ordered variant, but with NO_BARRIER instead of
    DMB_SY. */
5871 RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_32(ASMAtomicUoAnd32, pu32, NO_BARRIER,
5872 "and %w[uNew], %w[uNew], %w[uVal]\n\t",
5873 "and %[uNew], %[uNew], %[uVal]\n\t",
5874 [uVal] "r" (u32));
5875# endif
5876
5877# else
5878# error "Port me"
5879# endif
5880}
5881#endif
5882
5883
5884/**
5885 * Atomically AND an unsigned 32-bit value, unordered, extended version (for
5886 * bitmap fallback).
5887 *
 * Only ARM has a genuinely unordered implementation; all other targets
 * forward to the ordered ASMAtomicAndExU32().
 *
5888 * @returns Old value.
5889 * @param pu32 Pointer to the variable to AND @a u32 with.
5890 * @param u32 The value to AND @a *pu32 with.
5891 */
5892DECLINLINE(uint32_t) ASMAtomicUoAndExU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_DEF
5893{
5894#if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
5895# if defined(RTASM_ARM64_USE_FEAT_LSE)
5896 uint32_t u32OldRet;
 /* ldclr clears bits (operand fBitsToClear), so the AND mask is passed
    inverted (~u32).
    NOTE(review): the label below is spelled ASMAtomicAndExU32, not
    ASMAtomicUoAndExU32; looks like a copy & paste leftover. */
5897 __asm__ __volatile__("Lstart_ASMAtomicAndExU32_%=:\n\t"
5898 "ldclr %w[fBitsToClear], %w[uOldRet], %[pMem]\n\t"
5899 : [pMem] "+Q" (*pu32)
5900 , [uOldRet] "=&r" (u32OldRet)
5901 : [fBitsToClear] "r" (~u32)
5902 : );
5903# else
 /* u32OldRet is not declared in this branch; presumably the macro below
    declares it - confirm against the macro definition. */
5904 RTASM_ARM_LOAD_MODIFY_STORE_RET_OLD_32(ASMAtomicUoAndEx32, pu32, NO_BARRIER,
5905 "and %w[uNew], %w[uOld], %w[uVal]\n\t",
5906 "and %[uNew], %[uOld], %[uVal]\n\t",
5907 [uVal] "r" (u32));
5908# endif
5909 return u32OldRet;
5910
5911#else
5912 return ASMAtomicAndExU32(pu32, u32); /* (we have no unordered cmpxchg primitive atm.) */
5913#endif
5914}
5915
5916
5917/**
5918 * Atomically And a signed 32-bit value, unordered.
5919 *
5920 * @param pi32 Pointer to the pointer variable to AND i32 with.
5921 * @param i32 The value to AND *pi32 with.
5922 *
5923 * @remarks x86: Requires a 386 or later.
5924 */
5925DECLINLINE(void) ASMAtomicUoAndS32(int32_t volatile RT_FAR *pi32, int32_t i32) RT_NOTHROW_DEF
5926{
5927 ASMAtomicUoAndU32((uint32_t volatile RT_FAR *)pi32, (uint32_t)i32);
5928}
5929
5930
5931/**
5932 * Atomically AND an unsigned 64-bit value, unordered.
5933 *
 * The implementation is picked at compile time: GNU style AMD64 inline
 * assembly (and without lock prefix), ARM inline assembly without barrier,
 * or - for anything else - a compare-exchange retry loop.
 *
5934 * @param pu64 Pointer to the variable to AND @a u64 with.
5935 * @param u64 The value to AND @a *pu64 with.
5936 *
5937 * @remarks x86: Requires a Pentium or later.
5938 */
5939#if RT_INLINE_ASM_EXTERNAL_TMP_ARM
5940DECLASM(void) ASMAtomicUoAndU64(uint64_t volatile RT_FAR *pu64, uint64_t u64) RT_NOTHROW_PROTO;
5941#else
5942DECLINLINE(void) ASMAtomicUoAndU64(uint64_t volatile RT_FAR *pu64, uint64_t u64) RT_NOTHROW_DEF
5943{
5944# if RT_INLINE_ASM_GNU_STYLE && defined(RT_ARCH_AMD64)
5945 __asm__ __volatile__("andq %1, %0\n\t"
5946 : "=m" (*pu64)
5947 : "r" (u64)
5948 , "m" (*pu64)
5949 : "cc");
5950
5951# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
5952# if defined(RTASM_ARM64_USE_FEAT_LSE)
 /* stclr clears bits (operand fBitsToClear), so the AND mask is passed
    inverted (~u64). */
5953 __asm__ __volatile__("Lstart_ASMAtomicUoAndU64_%=:\n\t"
5954 "stclr %[fBitsToClear], %[pMem]\n\t"
5955 : [pMem] "+Q" (*pu64)
5956 : [fBitsToClear] "r" (~u64)
5957 : );
5958# else
 /* First code string is the 64-bit variant; the second ANDs the two
    halves separately (%H selects the other half of the register pair). */
5959 RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_64(ASMAtomicUoAndU64, pu64, NO_BARRIER,
5960 "and %[uNew], %[uNew], %[uVal]\n\t"
5961 ,
5962 "and %[uNew], %[uNew], %[uVal]\n\t"
5963 "and %H[uNew], %H[uNew], %H[uVal]\n\t",
5964 [uVal] "r" (u64));
5965# endif
5966
5967# else
 /* Generic fallback: same compare-exchange retry loop as the ordered
    ASMAtomicAndU64() uses. */
5968 for (;;)
5969 {
5970 uint64_t u64Old = ASMAtomicUoReadU64(pu64);
5971 uint64_t u64New = u64Old & u64;
5972 if (ASMAtomicCmpXchgU64(pu64, u64New, u64Old))
5973 break;
5974 ASMNopPause();
5975 }
5976# endif
5977}
5978#endif
5979
5980
5981/**
5982 * Atomically And a signed 64-bit value, unordered.
5983 *
5984 * @param pi64 Pointer to the pointer variable to AND i64 with.
5985 * @param i64 The value to AND *pi64 with.
5986 *
5987 * @remarks x86: Requires a Pentium or later.
5988 */
5989DECLINLINE(void) ASMAtomicUoAndS64(int64_t volatile RT_FAR *pi64, int64_t i64) RT_NOTHROW_DEF
5990{
5991 ASMAtomicUoAndU64((uint64_t volatile RT_FAR *)pi64, (uint64_t)i64);
5992}
5993
5994
5995/**
5996 * Atomically XORs an unsigned 32-bit value, unordered but interrupt safe.
5997 *
5998 * @param pu32 Pointer to the variable to XOR @a u32 with.
5999 * @param u32 The value to XOR @a *pu32 with.
6000 *
6001 * @remarks x86: Requires a 386 or later.
6002 */
6003#if RT_INLINE_ASM_EXTERNAL_TMP_ARM
6004RT_ASM_DECL_PRAGMA_WATCOM(void) ASMAtomicUoXorU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_PROTO;
6005#else
6006DECLINLINE(void) ASMAtomicUoXorU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_DEF
6007{
6008# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
6009# if RT_INLINE_ASM_GNU_STYLE /* One XORL on the memory operand; no LOCK prefix is emitted. */
6010 __asm__ __volatile__("xorl %1, %0\n\t"
6011 : "=m" (*pu32)
6012 : "ir" (u32)
6013 , "m" (*pu32)
6014 : "cc");
6015# else /* Same single XOR instruction written as an __asm block. */
6016 __asm
6017 {
6018 mov eax, [u32]
6019# ifdef RT_ARCH_AMD64
6020 mov rdx, [pu32]
6021 xor [rdx], eax
6022# else
6023 mov edx, [pu32]
6024 xor [edx], eax
6025# endif
6026 }
6027# endif
6028
6029# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
6030# if defined(RTASM_ARM64_USE_FEAT_LSE) /* LSE: STEOR applies the mask; no old value is fetched. */
6031 __asm__ __volatile__("Lstart_ASMAtomicUoXorU32_%=:\n\t"
6032 "steor %w[fBitMask], %[pMem]\n\t"
6033 : [pMem] "+Q" (*pu32)
6034 : [fBitMask] "r" (u32)
6035 : );
6036# else /* No LSE: helper macro (defined elsewhere) wraps the EOR, invoked with NO_BARRIER. */
6037 RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_32(ASMAtomicUoXorU32, pu32, NO_BARRIER,
6038 "eor %w[uNew], %w[uNew], %w[uVal]\n\t",
6039 "eor %[uNew], %[uNew], %[uVal]\n\t",
6040 [uVal] "r" (u32));
6041# endif
6042
6043# else
6044# error "Port me"
6045# endif
6046}
6047#endif
6048
6049
6050/**
6051 * Atomically XORs an unsigned 32-bit value, unordered but interrupt safe,
6052 * extended version (for bitmap fallback).
6053 *
6054 * @returns Old value.
6055 * @param pu32 Pointer to the variable to XOR @a u32 with.
6056 * @param u32 The value to XOR @a *pu32 with.
6057 */
6058DECLINLINE(uint32_t) ASMAtomicUoXorExU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_DEF
6059{
6060#if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
6061# if defined(RTASM_ARM64_USE_FEAT_LSE) /* LSE: LDEOR applies the mask and loads the previous value into uOldRet. */
6062 uint32_t u32OldRet;
6063 __asm__ __volatile__("Lstart_ASMAtomicUoXorExU32_%=:\n\t"
6064 "ldeor %w[fBitMask], %w[uOldRet], %[pMem]\n\t"
6065 : [pMem] "+Q" (*pu32)
6066 , [uOldRet] "=&r" (u32OldRet)
6067 : [fBitMask] "r" (u32)
6068 : );
6069# else /* No LSE: NOTE(review): u32OldRet is presumably declared by the helper macro in this branch - confirm. */
6070 RTASM_ARM_LOAD_MODIFY_STORE_RET_OLD_32(ASMAtomicUoXorExU32, pu32, NO_BARRIER,
6071 "eor %w[uNew], %w[uOld], %w[uVal]\n\t",
6072 "eor %[uNew], %[uOld], %[uVal]\n\t",
6073 [uVal] "r" (u32));
6074# endif
6075 return u32OldRet;
6076
6077#else /* Non-ARM targets simply use the ordered variant. */
6078 return ASMAtomicXorExU32(pu32, u32); /* (we have no unordered cmpxchg primitive atm.) */
6079#endif
6080}
6081
6082
6083/**
6084 * Atomically XOR a signed 32-bit value, unordered.
6085 *
6086 * @param pi32 Pointer to the variable to XOR @a u32 with.
6087 * @param i32 The value to XOR @a *pu32 with.
6088 *
6089 * @remarks x86: Requires a 386 or later.
6090 */
6091DECLINLINE(void) ASMAtomicUoXorS32(int32_t volatile RT_FAR *pi32, int32_t i32) RT_NOTHROW_DEF
6092{
6093 ASMAtomicUoXorU32((uint32_t volatile RT_FAR *)pi32, (uint32_t)i32);
6094}
6095
6096
6097/**
6098 * Atomically increments an unsigned 32-bit value, unordered.
6099 *
6100 * @returns The new (incremented) value.
6101 * @param pu32 Pointer to the variable to increment.
6102 *
6103 * @remarks x86: Requires a 486 or later.
6104 */
6105#if RT_INLINE_ASM_EXTERNAL_TMP_ARM
6106RT_ASM_DECL_PRAGMA_WATCOM(uint32_t) ASMAtomicUoIncU32(uint32_t volatile RT_FAR *pu32) RT_NOTHROW_PROTO;
6107#else
6108DECLINLINE(uint32_t) ASMAtomicUoIncU32(uint32_t volatile RT_FAR *pu32) RT_NOTHROW_DEF
6109{
6110# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
6111 uint32_t u32;
6112# if RT_INLINE_ASM_GNU_STYLE /* XADDL without LOCK prefix; u32 goes in as 1 and comes back as the old value. */
6113 __asm__ __volatile__("xaddl %0, %1\n\t"
6114 : "=r" (u32)
6115 , "=m" (*pu32)
6116 : "0" (1)
6117 , "m" (*pu32)
6118 : "memory" /** @todo why 'memory'? */
6119 , "cc");
6120 return u32 + 1; /* Old value + 1 = the value now stored. */
6121# else /* Same XADD sequence written as an __asm block. */
6122 __asm
6123 {
6124 mov eax, 1
6125# ifdef RT_ARCH_AMD64
6126 mov rdx, [pu32]
6127 xadd [rdx], eax
6128# else
6129 mov edx, [pu32]
6130 xadd [edx], eax
6131# endif
6132 mov u32, eax
6133 }
6134 return u32 + 1; /* Old value + 1 = the value now stored. */
6135# endif
6136
6137# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
6138 /* M1 benchmark: ldadd=2031 vs non-lse=6301 (ps/call) */
6139# if defined(RTASM_ARM64_USE_FEAT_LSE) /* LSE: LDADD fetches the old value, the following ADD turns it into the new one. */
6140 uint32_t u32NewRet;
6141 __asm__ __volatile__("Lstart_ASMAtomicUoIncU32_%=:\n\t"
6142 "ldadd %w[uAddend], %w[uNewRet], %[pMem]\n\t"
6143 "add %w[uNewRet], %w[uNewRet], #1\n\t"
6144 : [pMem] "+Q" (*pu32)
6145 , [uNewRet] "=&r" (u32NewRet)
6146 : [uAddend] "r" ((uint32_t)1)
6147 : );
6148# else /* No LSE: NOTE(review): u32NewRet is presumably declared by the helper macro in this branch - confirm. */
6149 RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_32(ASMAtomicUoIncU32, pu32, NO_BARRIER,
6150 "add %w[uNew], %w[uNew], #1\n\t",
6151 "add %[uNew], %[uNew], #1\n\t" /* arm6 / thumb2+ */,
6152 "X" (0) /* dummy */);
6153# endif
6154 return u32NewRet;
6155
6156# else
6157# error "Port me"
6158# endif
6159}
6160#endif
6161
6162
6163/**
6164 * Atomically decrements an unsigned 32-bit value, unordered.
6165 *
6166 * @returns The new (decremented) value.
6167 * @param pu32 Pointer to the variable to decrement.
6168 *
6169 * @remarks x86: Requires a 486 or later.
6170 */
6171#if RT_INLINE_ASM_EXTERNAL_TMP_ARM
6172RT_ASM_DECL_PRAGMA_WATCOM(uint32_t) ASMAtomicUoDecU32(uint32_t volatile RT_FAR *pu32) RT_NOTHROW_PROTO;
6173#else
6174DECLINLINE(uint32_t) ASMAtomicUoDecU32(uint32_t volatile RT_FAR *pu32) RT_NOTHROW_DEF
6175{
6176# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
6177 uint32_t u32;
6178# if RT_INLINE_ASM_GNU_STYLE /* NOTE(review): this branch uses a LOCK prefix, unlike the __asm branch below and ASMAtomicUoIncU32 - confirm intent. */
6179 __asm__ __volatile__("lock; xaddl %0, %1\n\t"
6180 : "=r" (u32)
6181 , "=m" (*pu32)
6182 : "0" (-1)
6183 , "m" (*pu32)
6184 : "memory"
6185 , "cc");
6186 return u32 - 1; /* XADD hands back the old value; old - 1 = the value now stored. */
6187# else /* XADD of -1 written as an __asm block (no LOCK prefix here). */
6188 __asm
6189 {
6190 mov eax, -1
6191# ifdef RT_ARCH_AMD64
6192 mov rdx, [pu32]
6193 xadd [rdx], eax
6194# else
6195 mov edx, [pu32]
6196 xadd [edx], eax
6197# endif
6198 mov u32, eax
6199 }
6200 return u32 - 1; /* Old value - 1 = the value now stored. */
6201# endif
6202
6203# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
6204 /* M1 benchmark: ldadd=2101 vs non-lse=6268 (ps/call) */
6205# if defined(RTASM_ARM64_USE_FEAT_LSE) /* LSE: LDADD of all-ones (i.e. -1) fetches the old value, the SUB turns it into the new one. */
6206 uint32_t u32NewRet;
6207 __asm__ __volatile__("Lstart_ASMAtomicUoDecU32_%=:\n\t"
6208 "ldadd %w[uAddend], %w[uNewRet], %[pMem]\n\t"
6209 "sub %w[uNewRet], %w[uNewRet], #1\n\t"
6210 : [pMem] "+Q" (*pu32)
6211 , [uNewRet] "=&r" (u32NewRet)
6212 : [uAddend] "r" (~(uint32_t)0)
6213 : );
6214# else /* No LSE: NOTE(review): u32NewRet is presumably declared by the helper macro in this branch - confirm. */
6215 RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_32(ASMAtomicUoDecU32, pu32, NO_BARRIER,
6216 "sub %w[uNew], %w[uNew], #1\n\t",
6217 "sub %[uNew], %[uNew], #1\n\t" /* arm6 / thumb2+ */,
6218 "X" (0) /* dummy */);
6219# endif
6220 return u32NewRet;
6221
6222# else
6223# error "Port me"
6224# endif
6225}
6226#endif
6227
6228
6229/**
6230 * Reverses the byte order of the given 16-bit integer.
6231 *
6232 * @returns The value with its two bytes swapped.
6233 * @param u16 16-bit integer value.
6234 */
6235#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
6236RT_ASM_DECL_PRAGMA_WATCOM(uint16_t) ASMByteSwapU16(uint16_t u16) RT_NOTHROW_PROTO;
6237#else
6238DECLINLINE(uint16_t) ASMByteSwapU16(uint16_t u16) RT_NOTHROW_DEF
6239{
6240# if RT_INLINE_ASM_USES_INTRIN /* Compiler intrinsic available. */
6241 return _byteswap_ushort(u16);
6242
6243# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
6244# if RT_INLINE_ASM_GNU_STYLE /* Rotating a 16-bit value by 8 exchanges its two bytes. */
6245 __asm__ ("rorw $8, %0" : "=r" (u16) : "0" (u16) : "cc");
6246# else /* Same 8-bit rotate written as an _asm block. */
6247 _asm
6248 {
6249 mov ax, [u16]
6250 ror ax, 8
6251 mov [u16], ax
6252 }
6253# endif
6254 return u16;
6255
6256# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
6257 uint32_t u32Ret; /* REV16 works on a full register; the result is narrowed on return. */
6258 __asm__ __volatile__(
6259# if defined(RT_ARCH_ARM64)
6260 "rev16 %w[uRet], %w[uVal]\n\t"
6261# else
6262 "rev16 %[uRet], %[uVal]\n\t"
6263# endif
6264 : [uRet] "=r" (u32Ret)
6265 : [uVal] "r" (u16));
6266 return (uint16_t)u32Ret;
6267
6268# else
6269# error "Port me"
6270# endif
6271}
6272#endif
6273
6274
6275/**
6276 * Reverses the byte order of the given 32-bit integer.
6277 *
6278 * @returns The value with its four bytes in reversed order.
6279 * @param u32 32-bit integer value.
6280 */
6281#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
6282RT_ASM_DECL_PRAGMA_WATCOM(uint32_t) ASMByteSwapU32(uint32_t u32) RT_NOTHROW_PROTO;
6283#else
6284DECLINLINE(uint32_t) ASMByteSwapU32(uint32_t u32) RT_NOTHROW_DEF
6285{
6286# if RT_INLINE_ASM_USES_INTRIN /* Compiler intrinsic available. */
6287 return _byteswap_ulong(u32);
6288
6289# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
6290# if RT_INLINE_ASM_GNU_STYLE /* BSWAP in place; operand 0 is both input and output. */
6291 __asm__ ("bswapl %0" : "=r" (u32) : "0" (u32));
6292# else /* Same BSWAP written as an _asm block. */
6293 _asm
6294 {
6295 mov eax, [u32]
6296 bswap eax
6297 mov [u32], eax
6298 }
6299# endif
6300 return u32;
6301
6302# elif defined(RT_ARCH_ARM64)
6303 uint64_t u64Ret; /* REV32 is issued on 64-bit registers here; only the low 32 bits are returned. */
6304 __asm__ __volatile__("rev32 %[uRet], %[uVal]\n\t"
6305 : [uRet] "=r" (u64Ret)
6306 : [uVal] "r" ((uint64_t)u32));
6307 return (uint32_t)u64Ret;
6308
6309# elif defined(RT_ARCH_ARM32)
6310 __asm__ __volatile__("rev %[uRet], %[uVal]\n\t" /* Input is tied to the output operand via the "[uRet]" constraint. */
6311 : [uRet] "=r" (u32)
6312 : [uVal] "[uRet]" (u32));
6313 return u32;
6314
6315# else
6316# error "Port me"
6317# endif
6318}
6319#endif
6320
6321
6322/**
6323 * Reverses the byte order of the given 64-bit integer.
6324 *
6325 * @returns The value with its eight bytes in reversed order.
6326 * @param u64 64-bit integer value.
6327 */
6328DECLINLINE(uint64_t) ASMByteSwapU64(uint64_t u64) RT_NOTHROW_DEF
6329{
6330#if defined(RT_ARCH_AMD64) && RT_INLINE_ASM_USES_INTRIN /* Compiler intrinsic available. */
6331 return _byteswap_uint64(u64);
6332
6333# elif RT_INLINE_ASM_GNU_STYLE && defined(RT_ARCH_AMD64) /* BSWAPQ in place; operand 0 is both input and output. */
6334 __asm__ ("bswapq %0" : "=r" (u64) : "0" (u64));
6335 return u64;
6336
6337# elif defined(RT_ARCH_ARM64)
6338 __asm__ __volatile__("rev %[uRet], %[uVal]\n\t" /* Input is tied to the output operand via the "[uRet]" constraint. */
6339 : [uRet] "=r" (u64)
6340 : [uVal] "[uRet]" (u64));
6341 return u64;
6342
6343#else /* Generic: byte swap each 32-bit half and exchange the halves. */
6344 return (uint64_t)ASMByteSwapU32((uint32_t)u64) << 32
6345 | (uint64_t)ASMByteSwapU32((uint32_t)(u64 >> 32));
6346#endif
6347}
6348
6349
6350
6351/** @defgroup grp_inline_bits Bit Operations
6352 * @{
6353 */
6354
6355
6356/**
6357 * Sets a bit in a bitmap (no LOCK prefix / unordered helper is used).
6358 *
6359 * @param pvBitmap Pointer to the bitmap (little endian). This should be
6360 * 32-bit aligned.
6361 * @param iBit The bit to set.
6362 *
6363 * @remarks The 32-bit aligning of pvBitmap is not a strict requirement.
6364 * However, doing so will yield better performance as well as avoiding
6365 * traps accessing the last bits in the bitmap.
6366 */
6367#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
6368RT_ASM_DECL_PRAGMA_WATCOM(void) ASMBitSet(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_PROTO;
6369#else
6370DECLINLINE(void) ASMBitSet(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_DEF
6371{
6372# if RT_INLINE_ASM_USES_INTRIN /* Intrinsic's return value (the previous bit) is ignored. */
6373 _bittestandset((long RT_FAR *)pvBitmap, iBit);
6374
6375# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
6376# if RT_INLINE_ASM_GNU_STYLE /* BTS on the bitmap memory with iBit as bit offset. */
6377 __asm__ __volatile__("btsl %1, %0"
6378 : "=m" (*(volatile long RT_FAR *)pvBitmap)
6379 : "Ir" (iBit)
6380 , "m" (*(volatile long RT_FAR *)pvBitmap)
6381 : "memory"
6382 , "cc");
6383# else /* Same BTS written as an __asm block. */
6384 __asm
6385 {
6386# ifdef RT_ARCH_AMD64
6387 mov rax, [pvBitmap]
6388 mov edx, [iBit]
6389 bts [rax], edx
6390# else
6391 mov eax, [pvBitmap]
6392 mov edx, [iBit]
6393 bts [eax], edx
6394# endif
6395 }
6396# endif
6397
6398# else /* Portable path: OR a single-bit, little-endian converted mask into the dword holding the bit. */
6399 int32_t offBitmap = iBit / 32;
6400 AssertStmt(!((uintptr_t)pvBitmap & 3), offBitmap += (uintptr_t)pvBitmap & 3; iBit += ((uintptr_t)pvBitmap & 3) * 8); /* NOTE(review): on misalignment a byte count is added to the dword index - confirm intent. */
6401 ASMAtomicUoOrU32(&((uint32_t volatile *)pvBitmap)[offBitmap], RT_H2LE_U32(RT_BIT_32(iBit & 31)));
6402# endif
6403}
6404#endif
6405
6406
6407/**
6408 * Atomically sets a bit in a bitmap, ordered.
6409 *
6410 * @param pvBitmap Pointer to the bitmap (little endian). Must be 32-bit
6411 * aligned, otherwise the memory access isn't atomic!
6412 * @param iBit The bit to set.
6413 *
6414 * @remarks x86: Requires a 386 or later.
6415 */
6416#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
6417RT_ASM_DECL_PRAGMA_WATCOM(void) ASMAtomicBitSet(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_PROTO;
6418#else
6419DECLINLINE(void) ASMAtomicBitSet(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_DEF
6420{
6421 AssertMsg(!((uintptr_t)pvBitmap & 3), ("address %p not 32-bit aligned", pvBitmap)); /* Alignment is asserted on every path. */
6422# if RT_INLINE_ASM_USES_INTRIN /* Interlocked intrinsic; its return value (the previous bit) is ignored. */
6423 _interlockedbittestandset((long RT_FAR *)pvBitmap, iBit);
6424# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
6425# if RT_INLINE_ASM_GNU_STYLE /* LOCK-prefixed BTS on the bitmap memory. */
6426 __asm__ __volatile__("lock; btsl %1, %0"
6427 : "=m" (*(volatile long *)pvBitmap)
6428 : "Ir" (iBit)
6429 , "m" (*(volatile long *)pvBitmap)
6430 : "memory"
6431 , "cc");
6432# else /* Same LOCK BTS written as an __asm block. */
6433 __asm
6434 {
6435# ifdef RT_ARCH_AMD64
6436 mov rax, [pvBitmap]
6437 mov edx, [iBit]
6438 lock bts [rax], edx
6439# else
6440 mov eax, [pvBitmap]
6441 mov edx, [iBit]
6442 lock bts [eax], edx
6443# endif
6444 }
6445# endif
6446
6447# else /* Portable path: ordered OR of a single-bit, little-endian converted mask into the dword holding the bit. */
6448 ASMAtomicOrU32(&((uint32_t volatile *)pvBitmap)[iBit / 32], RT_H2LE_U32(RT_BIT_32(iBit & 31)));
6449# endif
6450}
6451#endif
6452
6453
6454/**
6455 * Clears a bit in a bitmap (no LOCK prefix / unordered helper is used).
6456 *
6457 * @param pvBitmap Pointer to the bitmap (little endian).
6458 * @param iBit The bit to clear.
6459 *
6460 * @remarks The 32-bit aligning of pvBitmap is not a strict requirement.
6461 * However, doing so will yield better performance as well as avoiding
6462 * traps accessing the last bits in the bitmap.
6463 */
6464#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
6465RT_ASM_DECL_PRAGMA_WATCOM(void) ASMBitClear(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_PROTO;
6466#else
6467DECLINLINE(void) ASMBitClear(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_DEF
6468{
6469# if RT_INLINE_ASM_USES_INTRIN /* Intrinsic's return value (the previous bit) is ignored. */
6470 _bittestandreset((long RT_FAR *)pvBitmap, iBit);
6471
6472# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
6473# if RT_INLINE_ASM_GNU_STYLE /* BTR on the bitmap memory with iBit as bit offset. */
6474 __asm__ __volatile__("btrl %1, %0"
6475 : "=m" (*(volatile long RT_FAR *)pvBitmap)
6476 : "Ir" (iBit)
6477 , "m" (*(volatile long RT_FAR *)pvBitmap)
6478 : "memory"
6479 , "cc");
6480# else /* Same BTR written as an __asm block. */
6481 __asm
6482 {
6483# ifdef RT_ARCH_AMD64
6484 mov rax, [pvBitmap]
6485 mov edx, [iBit]
6486 btr [rax], edx
6487# else
6488 mov eax, [pvBitmap]
6489 mov edx, [iBit]
6490 btr [eax], edx
6491# endif
6492 }
6493# endif
6494
6495# else /* Portable path: AND the dword holding the bit with a mask that has only that bit cleared. */
6496 int32_t offBitmap = iBit / 32;
6497 AssertStmt(!((uintptr_t)pvBitmap & 3), offBitmap += (uintptr_t)pvBitmap & 3; iBit += ((uintptr_t)pvBitmap & 3) * 8); /* NOTE(review): on misalignment a byte count is added to the dword index - confirm intent. */
6498 ASMAtomicUoAndU32(&((uint32_t volatile *)pvBitmap)[offBitmap], RT_H2LE_U32(~RT_BIT_32(iBit & 31)));
6499# endif
6500}
6501#endif
6502
6503
6504/**
6505 * Atomically clears a bit in a bitmap, ordered.
6506 *
6507 * @param pvBitmap Pointer to the bitmap (little endian). Must be 32-bit
6508 * aligned, otherwise the memory access isn't atomic!
6509 * @param iBit The bit to clear.
6510 *
6511 * @remarks No memory barrier, take care on smp. (NOTE(review): conflicts with "ordered" above and the LOCK prefix below - confirm.)
6512 * @remarks x86: Requires a 386 or later.
6513 */
6514#if RT_INLINE_ASM_EXTERNAL_TMP_ARM
6515RT_ASM_DECL_PRAGMA_WATCOM(void) ASMAtomicBitClear(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_PROTO;
6516#else
6517DECLINLINE(void) ASMAtomicBitClear(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_DEF
6518{
6519 AssertMsg(!((uintptr_t)pvBitmap & 3), ("address %p not 32-bit aligned", pvBitmap)); /* Alignment is asserted on every path. */
6520# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
6521# if RT_INLINE_ASM_GNU_STYLE /* LOCK-prefixed BTR on the bitmap memory. */
6522 __asm__ __volatile__("lock; btrl %1, %0"
6523 : "=m" (*(volatile long RT_FAR *)pvBitmap)
6524 : "Ir" (iBit)
6525 , "m" (*(volatile long RT_FAR *)pvBitmap)
6526 : "memory"
6527 , "cc");
6528# else /* Same LOCK BTR written as an __asm block. */
6529 __asm
6530 {
6531# ifdef RT_ARCH_AMD64
6532 mov rax, [pvBitmap]
6533 mov edx, [iBit]
6534 lock btr [rax], edx
6535# else
6536 mov eax, [pvBitmap]
6537 mov edx, [iBit]
6538 lock btr [eax], edx
6539# endif
6540 }
6541# endif
6542# else /* Portable path: ordered AND with a mask that has only the target bit cleared. */
6543 ASMAtomicAndU32(&((uint32_t volatile *)pvBitmap)[iBit / 32], RT_H2LE_U32(~RT_BIT_32(iBit & 31)));
6544# endif
6545}
6546#endif
6547
6548
6549/**
6550 * Toggles a bit in a bitmap (no LOCK prefix / unordered helper is used).
6551 *
6552 * @param pvBitmap Pointer to the bitmap (little endian).
6553 * @param iBit The bit to toggle.
6554 *
6555 * @remarks The 32-bit aligning of pvBitmap is not a strict requirement.
6556 * However, doing so will yield better performance as well as avoiding
6557 * traps accessing the last bits in the bitmap.
6558 */
6559#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
6560RT_ASM_DECL_PRAGMA_WATCOM(void) ASMBitToggle(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_PROTO;
6561#else
6562DECLINLINE(void) ASMBitToggle(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_DEF
6563{
6564# if RT_INLINE_ASM_USES_INTRIN /* Intrinsic's return value (the previous bit) is ignored. */
6565 _bittestandcomplement((long RT_FAR *)pvBitmap, iBit);
6566# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
6567# if RT_INLINE_ASM_GNU_STYLE /* BTC on the bitmap memory with iBit as bit offset. */
6568 __asm__ __volatile__("btcl %1, %0"
6569 : "=m" (*(volatile long *)pvBitmap)
6570 : "Ir" (iBit)
6571 , "m" (*(volatile long *)pvBitmap)
6572 : "memory"
6573 , "cc");
6574# else /* Same BTC written as an __asm block. */
6575 __asm
6576 {
6577# ifdef RT_ARCH_AMD64
6578 mov rax, [pvBitmap]
6579 mov edx, [iBit]
6580 btc [rax], edx
6581# else
6582 mov eax, [pvBitmap]
6583 mov edx, [iBit]
6584 btc [eax], edx
6585# endif
6586 }
6587# endif
6588# else /* Portable path: XOR a single-bit, little-endian converted mask into the dword holding the bit. */
6589 int32_t offBitmap = iBit / 32;
6590 AssertStmt(!((uintptr_t)pvBitmap & 3), offBitmap += (uintptr_t)pvBitmap & 3; iBit += ((uintptr_t)pvBitmap & 3) * 8); /* NOTE(review): on misalignment a byte count is added to the dword index - confirm intent. */
6591 ASMAtomicUoXorU32(&((uint32_t volatile *)pvBitmap)[offBitmap], RT_H2LE_U32(RT_BIT_32(iBit & 31)));
6592# endif
6593}
6594#endif
6595
6596
6597/**
6598 * Atomically toggles a bit in a bitmap, ordered.
6599 *
6600 * @param pvBitmap Pointer to the bitmap (little endian). Must be 32-bit
6601 * aligned, otherwise the memory access isn't atomic!
6602 * @param iBit The bit to toggle.
6603 *
6604 * @remarks x86: Requires a 386 or later.
6605 */
6606#if RT_INLINE_ASM_EXTERNAL_TMP_ARM
6607RT_ASM_DECL_PRAGMA_WATCOM(void) ASMAtomicBitToggle(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_PROTO;
6608#else
6609DECLINLINE(void) ASMAtomicBitToggle(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_DEF
6610{
6611 AssertMsg(!((uintptr_t)pvBitmap & 3), ("address %p not 32-bit aligned", pvBitmap)); /* Alignment is asserted on every path. */
6612# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
6613# if RT_INLINE_ASM_GNU_STYLE /* LOCK-prefixed BTC on the bitmap memory. */
6614 __asm__ __volatile__("lock; btcl %1, %0"
6615 : "=m" (*(volatile long RT_FAR *)pvBitmap)
6616 : "Ir" (iBit)
6617 , "m" (*(volatile long RT_FAR *)pvBitmap)
6618 : "memory"
6619 , "cc");
6620# else /* Same LOCK BTC written as an __asm block. */
6621 __asm
6622 {
6623# ifdef RT_ARCH_AMD64
6624 mov rax, [pvBitmap]
6625 mov edx, [iBit]
6626 lock btc [rax], edx
6627# else
6628 mov eax, [pvBitmap]
6629 mov edx, [iBit]
6630 lock btc [eax], edx
6631# endif
6632 }
6633# endif
6634# else /* Portable path: ordered XOR of a single-bit, little-endian converted mask. */
6635 ASMAtomicXorU32(&((uint32_t volatile *)pvBitmap)[iBit / 32], RT_H2LE_U32(RT_BIT_32(iBit & 31)));
6636# endif
6637}
6638#endif
6639
6640
6641/**
6642 * Tests and sets a bit in a bitmap (no LOCK prefix / unordered helper is used).
6643 *
6644 * @returns true if the bit was set.
6645 * @returns false if the bit was clear.
6646 *
6647 * @param pvBitmap Pointer to the bitmap (little endian).
6648 * @param iBit The bit to test and set.
6649 *
6650 * @remarks The 32-bit aligning of pvBitmap is not a strict requirement.
6651 * However, doing so will yield better performance as well as avoiding
6652 * traps accessing the last bits in the bitmap.
6653 */
6654#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
6655RT_ASM_DECL_PRAGMA_WATCOM(bool) ASMBitTestAndSet(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_PROTO;
6656#else
6657DECLINLINE(bool) ASMBitTestAndSet(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_DEF
6658{
6659 union { bool f; uint32_t u32; uint8_t u8; } rc; /* Written as u8/u32 by the branches below, read back as bool. */
6660# if RT_INLINE_ASM_USES_INTRIN /* The intrinsic's result is the previous bit value. */
6661 rc.u8 = _bittestandset((long RT_FAR *)pvBitmap, iBit);
6662
6663# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
6664# if RT_INLINE_ASM_GNU_STYLE /* BTS puts the old bit in CF; SETC + AND turn that into 0 or 1 in rc.u32. */
6665 __asm__ __volatile__("btsl %2, %1\n\t"
6666 "setc %b0\n\t"
6667 "andl $1, %0\n\t"
6668 : "=q" (rc.u32)
6669 , "=m" (*(volatile long RT_FAR *)pvBitmap)
6670 : "Ir" (iBit)
6671 , "m" (*(volatile long RT_FAR *)pvBitmap)
6672 : "memory"
6673 , "cc");
6674# else /* Same BTS / SETC / AND sequence written as an __asm block. */
6675 __asm
6676 {
6677 mov edx, [iBit]
6678# ifdef RT_ARCH_AMD64
6679 mov rax, [pvBitmap]
6680 bts [rax], edx
6681# else
6682 mov eax, [pvBitmap]
6683 bts [eax], edx
6684# endif
6685 setc al
6686 and eax, 1
6687 mov [rc.u32], eax
6688 }
6689# endif
6690
6691# else /* Portable path: OR in the bit, then extract that bit from the value the helper returns (presumably the old dword). */
6692 int32_t offBitmap = iBit / 32;
6693 AssertStmt(!((uintptr_t)pvBitmap & 3), offBitmap += (uintptr_t)pvBitmap & 3; iBit += ((uintptr_t)pvBitmap & 3) * 8); /* NOTE(review): on misalignment a byte count is added to the dword index - confirm intent. */
6694 rc.u32 = RT_LE2H_U32(ASMAtomicUoOrExU32(&((uint32_t volatile *)pvBitmap)[offBitmap], RT_H2LE_U32(RT_BIT_32(iBit & 31))))
6695 >> (iBit & 31);
6696 rc.u32 &= 1;
6697# endif
6698 return rc.f;
6699}
6700#endif
6701
6702
6703/**
6704 * Atomically tests and sets a bit in a bitmap, ordered.
6705 *
6706 * @returns true if the bit was set.
6707 * @returns false if the bit was clear.
6708 *
6709 * @param pvBitmap Pointer to the bitmap (little endian). Must be 32-bit
6710 * aligned, otherwise the memory access isn't atomic!
6711 * @param iBit The bit to test and set.
6712 *
6713 * @remarks x86: Requires a 386 or later.
6714 */
6715#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
6716RT_ASM_DECL_PRAGMA_WATCOM(bool) ASMAtomicBitTestAndSet(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_PROTO;
6717#else
6718DECLINLINE(bool) ASMAtomicBitTestAndSet(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_DEF
6719{
6720 union { bool f; uint32_t u32; uint8_t u8; } rc; /* Written as u8/u32 by the branches below, read back as bool. */
6721 AssertMsg(!((uintptr_t)pvBitmap & 3), ("address %p not 32-bit aligned", pvBitmap)); /* Alignment is asserted on every path. */
6722# if RT_INLINE_ASM_USES_INTRIN /* The interlocked intrinsic's result is the previous bit value. */
6723 rc.u8 = _interlockedbittestandset((long RT_FAR *)pvBitmap, iBit);
6724# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
6725# if RT_INLINE_ASM_GNU_STYLE /* LOCK BTS puts the old bit in CF; SETC + AND turn that into 0 or 1 in rc.u32. */
6726 __asm__ __volatile__("lock; btsl %2, %1\n\t"
6727 "setc %b0\n\t"
6728 "andl $1, %0\n\t"
6729 : "=q" (rc.u32)
6730 , "=m" (*(volatile long RT_FAR *)pvBitmap)
6731 : "Ir" (iBit)
6732 , "m" (*(volatile long RT_FAR *)pvBitmap)
6733 : "memory"
6734 , "cc");
6735# else /* Same LOCK BTS / SETC / AND sequence written as an __asm block. */
6736 __asm
6737 {
6738 mov edx, [iBit]
6739# ifdef RT_ARCH_AMD64
6740 mov rax, [pvBitmap]
6741 lock bts [rax], edx
6742# else
6743 mov eax, [pvBitmap]
6744 lock bts [eax], edx
6745# endif
6746 setc al
6747 and eax, 1
6748 mov [rc.u32], eax
6749 }
6750# endif
6751
6752# else /* Portable path: ordered OR, then extract the bit from the value the helper returns (presumably the old dword). */
6753 rc.u32 = RT_LE2H_U32(ASMAtomicOrExU32(&((uint32_t volatile *)pvBitmap)[iBit / 32], RT_H2LE_U32(RT_BIT_32(iBit & 31))))
6754 >> (iBit & 31);
6755 rc.u32 &= 1;
6756# endif
6757 return rc.f;
6758}
6759#endif
6760
6761
6762/**
6763 * Tests and clears a bit in a bitmap (no LOCK prefix / unordered helper is used).
6764 *
6765 * @returns true if the bit was set.
6766 * @returns false if the bit was clear.
6767 *
6768 * @param pvBitmap Pointer to the bitmap (little endian).
6769 * @param iBit The bit to test and clear.
6770 *
6771 * @remarks The 32-bit aligning of pvBitmap is not a strict requirement.
6772 * However, doing so will yield better performance as well as avoiding
6773 * traps accessing the last bits in the bitmap.
6774 */
6775#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
6776RT_ASM_DECL_PRAGMA_WATCOM(bool) ASMBitTestAndClear(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_PROTO;
6777#else
6778DECLINLINE(bool) ASMBitTestAndClear(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_DEF
6779{
6780 union { bool f; uint32_t u32; uint8_t u8; } rc; /* Written as u8/u32 by the branches below, read back as bool. */
6781# if RT_INLINE_ASM_USES_INTRIN /* The intrinsic's result is the previous bit value. */
6782 rc.u8 = _bittestandreset((long RT_FAR *)pvBitmap, iBit);
6783
6784# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
6785# if RT_INLINE_ASM_GNU_STYLE /* BTR puts the old bit in CF; SETC + AND turn that into 0 or 1 in rc.u32. */
6786 __asm__ __volatile__("btrl %2, %1\n\t"
6787 "setc %b0\n\t"
6788 "andl $1, %0\n\t"
6789 : "=q" (rc.u32)
6790 , "=m" (*(volatile long RT_FAR *)pvBitmap)
6791 : "Ir" (iBit)
6792 , "m" (*(volatile long RT_FAR *)pvBitmap)
6793 : "memory"
6794 , "cc");
6795# else /* Same BTR / SETC / AND sequence written as an __asm block. */
6796 __asm
6797 {
6798 mov edx, [iBit]
6799# ifdef RT_ARCH_AMD64
6800 mov rax, [pvBitmap]
6801 btr [rax], edx
6802# else
6803 mov eax, [pvBitmap]
6804 btr [eax], edx
6805# endif
6806 setc al
6807 and eax, 1
6808 mov [rc.u32], eax
6809 }
6810# endif
6811
6812# else /* Portable path: AND out the bit, then extract it from the value the helper returns (presumably the old dword). */
6813 int32_t offBitmap = iBit / 32;
6814 AssertStmt(!((uintptr_t)pvBitmap & 3), offBitmap += (uintptr_t)pvBitmap & 3; iBit += ((uintptr_t)pvBitmap & 3) * 8); /* NOTE(review): on misalignment a byte count is added to the dword index - confirm intent. */
6815 rc.u32 = RT_LE2H_U32(ASMAtomicUoAndExU32(&((uint32_t volatile *)pvBitmap)[offBitmap], RT_H2LE_U32(~RT_BIT_32(iBit & 31))))
6816 >> (iBit & 31);
6817 rc.u32 &= 1;
6818# endif
6819 return rc.f;
6820}
6821#endif
6822
6823
6824/**
6825 * Atomically tests and clears a bit in a bitmap, ordered.
6826 *
6827 * @returns true if the bit was set.
6828 * @returns false if the bit was clear.
6829 *
6830 * @param pvBitmap Pointer to the bitmap (little endian). Must be 32-bit
6831 * aligned, otherwise the memory access isn't atomic!
6832 * @param iBit The bit to test and clear.
6833 *
6834 * @remarks No memory barrier, take care on smp. (NOTE(review): conflicts with "ordered" above and the LOCK prefix below - confirm.)
6835 * @remarks x86: Requires a 386 or later.
6836 */
6837#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
6838RT_ASM_DECL_PRAGMA_WATCOM(bool) ASMAtomicBitTestAndClear(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_PROTO;
6839#else
6840DECLINLINE(bool) ASMAtomicBitTestAndClear(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_DEF
6841{
6842 union { bool f; uint32_t u32; uint8_t u8; } rc; /* Written as u8/u32 by the branches below, read back as bool. */
6843 AssertMsg(!((uintptr_t)pvBitmap & 3), ("address %p not 32-bit aligned", pvBitmap)); /* Alignment is asserted on every path. */
6844# if RT_INLINE_ASM_USES_INTRIN /* The interlocked intrinsic's result is the previous bit value. */
6845 rc.u8 = _interlockedbittestandreset((long RT_FAR *)pvBitmap, iBit);
6846
6847# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
6848# if RT_INLINE_ASM_GNU_STYLE /* LOCK BTR puts the old bit in CF; SETC + AND turn that into 0 or 1 in rc.u32. */
6849 __asm__ __volatile__("lock; btrl %2, %1\n\t"
6850 "setc %b0\n\t"
6851 "andl $1, %0\n\t"
6852 : "=q" (rc.u32)
6853 , "=m" (*(volatile long RT_FAR *)pvBitmap)
6854 : "Ir" (iBit)
6855 , "m" (*(volatile long RT_FAR *)pvBitmap)
6856 : "memory"
6857 , "cc");
6858# else /* Same LOCK BTR / SETC / AND sequence written as an __asm block. */
6859 __asm
6860 {
6861 mov edx, [iBit]
6862# ifdef RT_ARCH_AMD64
6863 mov rax, [pvBitmap]
6864 lock btr [rax], edx
6865# else
6866 mov eax, [pvBitmap]
6867 lock btr [eax], edx
6868# endif
6869 setc al
6870 and eax, 1
6871 mov [rc.u32], eax
6872 }
6873# endif
6874
6875# else /* Portable path: ordered AND, then extract the bit from the value the helper returns (presumably the old dword). */
6876 rc.u32 = RT_LE2H_U32(ASMAtomicAndExU32(&((uint32_t volatile *)pvBitmap)[iBit / 32], RT_H2LE_U32(~RT_BIT_32(iBit & 31))))
6877 >> (iBit & 31);
6878 rc.u32 &= 1;
6879# endif
6880 return rc.f;
6881}
6882#endif
6883
6884
6885/**
6886 * Tests and toggles a bit in a bitmap (no LOCK prefix / unordered helper is used).
6887 *
6888 * @returns true if the bit was set.
6889 * @returns false if the bit was clear.
6890 *
6891 * @param pvBitmap Pointer to the bitmap (little endian).
6892 * @param iBit The bit to test and toggle.
6893 *
6894 * @remarks The 32-bit aligning of pvBitmap is not a strict requirement.
6895 * However, doing so will yield better performance as well as avoiding
6896 * traps accessing the last bits in the bitmap.
6897 */
6898#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
6899RT_ASM_DECL_PRAGMA_WATCOM(bool) ASMBitTestAndToggle(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_PROTO;
6900#else
6901DECLINLINE(bool) ASMBitTestAndToggle(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_DEF
6902{
6903 union { bool f; uint32_t u32; uint8_t u8; } rc; /* Written as u8/u32 by the branches below, read back as bool. */
6904# if RT_INLINE_ASM_USES_INTRIN /* The intrinsic's result is the previous bit value. */
6905 rc.u8 = _bittestandcomplement((long RT_FAR *)pvBitmap, iBit);
6906
6907# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
6908# if RT_INLINE_ASM_GNU_STYLE /* BTC puts the old bit in CF; SETC + AND turn that into 0 or 1 in rc.u32. */
6909 __asm__ __volatile__("btcl %2, %1\n\t"
6910 "setc %b0\n\t"
6911 "andl $1, %0\n\t"
6912 : "=q" (rc.u32)
6913 , "=m" (*(volatile long RT_FAR *)pvBitmap)
6914 : "Ir" (iBit)
6915 , "m" (*(volatile long RT_FAR *)pvBitmap)
6916 : "memory"
6917 , "cc");
6918# else /* Same BTC / SETC / AND sequence written as an __asm block. */
6919 __asm
6920 {
6921 mov edx, [iBit]
6922# ifdef RT_ARCH_AMD64
6923 mov rax, [pvBitmap]
6924 btc [rax], edx
6925# else
6926 mov eax, [pvBitmap]
6927 btc [eax], edx
6928# endif
6929 setc al
6930 and eax, 1
6931 mov [rc.u32], eax
6932 }
6933# endif
6934
6935# else /* Portable path: XOR the bit, then extract it from the old dword returned by ASMAtomicUoXorExU32. */
6936 int32_t offBitmap = iBit / 32;
6937 AssertStmt(!((uintptr_t)pvBitmap & 3), offBitmap += (uintptr_t)pvBitmap & 3; iBit += ((uintptr_t)pvBitmap & 3) * 8); /* NOTE(review): on misalignment a byte count is added to the dword index - confirm intent. */
6938 rc.u32 = RT_LE2H_U32(ASMAtomicUoXorExU32(&((uint32_t volatile *)pvBitmap)[offBitmap], RT_H2LE_U32(RT_BIT_32(iBit & 31))))
6939 >> (iBit & 31);
6940 rc.u32 &= 1;
6941# endif
6942 return rc.f;
6943}
6944#endif
6945
6946
6947/**
6948 * Atomically tests and toggles a bit in a bitmap, ordered.
6949 *
6950 * @returns true if the bit was set.
6951 * @returns false if the bit was clear.
6952 *
6953 * @param pvBitmap Pointer to the bitmap (little endian). Must be 32-bit
6954 * aligned, otherwise the memory access isn't atomic!
6955 * @param iBit The bit to test and toggle.
6956 *
6957 * @remarks x86: Requires a 386 or later.
6958 */
6959#if RT_INLINE_ASM_EXTERNAL_TMP_ARM
6960RT_ASM_DECL_PRAGMA_WATCOM(bool) ASMAtomicBitTestAndToggle(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_PROTO;
6961#else
6962DECLINLINE(bool) ASMAtomicBitTestAndToggle(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_DEF
6963{
6964 union { bool f; uint32_t u32; uint8_t u8; } rc;
6965 AssertMsg(!((uintptr_t)pvBitmap & 3), ("address %p not 32-bit aligned", pvBitmap));
6966# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
6967# if RT_INLINE_ASM_GNU_STYLE
6968 __asm__ __volatile__("lock; btcl %2, %1\n\t"
6969 "setc %b0\n\t"
6970 "andl $1, %0\n\t"
6971 : "=q" (rc.u32)
6972 , "=m" (*(volatile long RT_FAR *)pvBitmap)
6973 : "Ir" (iBit)
6974 , "m" (*(volatile long RT_FAR *)pvBitmap)
6975 : "memory"
6976 , "cc");
6977# else
6978 __asm
6979 {
6980 mov edx, [iBit]
6981# ifdef RT_ARCH_AMD64
6982 mov rax, [pvBitmap]
6983 lock btc [rax], edx
6984# else
6985 mov eax, [pvBitmap]
6986 lock btc [eax], edx
6987# endif
6988 setc al
6989 and eax, 1
6990 mov [rc.u32], eax
6991 }
6992# endif
6993
6994# else
6995 rc.u32 = RT_H2LE_U32(ASMAtomicXorExU32(&((uint32_t volatile *)pvBitmap)[iBit / 32], RT_LE2H_U32(RT_BIT_32(iBit & 31))))
6996 >> (iBit & 31);
6997 rc.u32 &= 1;
6998# endif
6999 return rc.f;
7000}
7001#endif
7002
7003
7004/**
7005 * Tests if a bit in a bitmap is set (read-only, the bitmap is not modified).
7006 *
7007 * @returns true if the bit is set.
7008 * @returns false if the bit is clear.
7009 *
7010 * @param pvBitmap Pointer to the bitmap (little endian).
7011 * @param iBit The bit to test.
7012 *
7013 * @remarks The 32-bit aligning of pvBitmap is not a strict requirement.
7014 * However, doing so will yield better performance as well as avoiding
7015 * traps accessing the last bits in the bitmap. (The portable fallback path below does AssertRelease on 32-bit alignment.)
7016 */
7017#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
7018RT_ASM_DECL_PRAGMA_WATCOM(bool) ASMBitTest(const volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_PROTO;
7019#else
7020DECLINLINE(bool) ASMBitTest(const volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_DEF
7021{
7022 union { bool f; uint32_t u32; uint8_t u8; } rc; /* Written as u32 by the branches below, read back as bool. */
7023# if RT_INLINE_ASM_USES_INTRIN /* The cast drops the const/volatile qualifiers for the intrinsic's parameter type. */
7024 rc.u32 = _bittest((long *)pvBitmap, iBit);
7025
7026# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
7027# if RT_INLINE_ASM_GNU_STYLE /* BT copies the bit into CF; SETC + AND turn that into 0 or 1 in rc.u32. */
7028
7029 __asm__ __volatile__("btl %2, %1\n\t"
7030 "setc %b0\n\t"
7031 "andl $1, %0\n\t"
7032 : "=q" (rc.u32)
7033 : "m" (*(const volatile long RT_FAR *)pvBitmap)
7034 , "Ir" (iBit)
7035 : "memory"
7036 , "cc");
7037# else /* Same BT / SETC / AND sequence written as an __asm block. */
7038 __asm
7039 {
7040 mov edx, [iBit]
7041# ifdef RT_ARCH_AMD64
7042 mov rax, [pvBitmap]
7043 bt [rax], edx
7044# else
7045 mov eax, [pvBitmap]
7046 bt [eax], edx
7047# endif
7048 setc al
7049 and eax, 1
7050 mov [rc.u32], eax
7051 }
7052# endif
7053
7054# else /* Portable path: read the dword holding the bit, convert from little endian, shift the bit down and mask. */
7055 int32_t offBitmap = iBit / 32;
7056 AssertRelease(!((uintptr_t)pvBitmap & (sizeof(uint32_t) - 1))); /* Unlike the x86 paths, this one insists on 32-bit alignment. */
7057 rc.u32 = RT_LE2H_U32(ASMAtomicUoReadU32(&((uint32_t volatile *)pvBitmap)[offBitmap])) >> (iBit & 31);
7058 rc.u32 &= 1;
7059# endif
7060 return rc.f;
7061}
7062#endif
7063
7064
7065#ifdef IPRT_INCLUDED_asm_mem_h
7066
7067/**
7068 * Clears a bit range within a bitmap.
7069 *
7070 * @param pvBitmap Pointer to the bitmap (little endian).
7071 * @param iBitStart The First bit to clear.
7072 * @param iBitEnd The first bit not to clear.
7073 */
7074DECLINLINE(void) ASMBitClearRange(volatile void RT_FAR *pvBitmap, size_t iBitStart, size_t iBitEnd) RT_NOTHROW_DEF
7075{
7076 if (iBitStart < iBitEnd)
7077 {
7078 uint32_t volatile RT_FAR *pu32 = (volatile uint32_t RT_FAR *)pvBitmap + (iBitStart >> 5);
7079 size_t iStart = iBitStart & ~(size_t)31;
7080 size_t iEnd = iBitEnd & ~(size_t)31;
7081 if (iStart == iEnd)
7082 *pu32 &= RT_H2LE_U32(((UINT32_C(1) << (iBitStart & 31)) - 1) | ~((UINT32_C(1) << (iBitEnd & 31)) - 1));
7083 else
7084 {
7085 /* bits in first dword. */
7086 if (iBitStart & 31)
7087 {
7088 *pu32 &= RT_H2LE_U32((UINT32_C(1) << (iBitStart & 31)) - 1);
7089 pu32++;
7090 iBitStart = iStart + 32;
7091 }
7092
7093 /* whole dwords. */
7094 if (iBitStart != iEnd)
7095 ASMMemZero32(pu32, (iEnd - iBitStart) >> 3);
7096
7097 /* bits in last dword. */
7098 if (iBitEnd & 31)
7099 {
7100 pu32 = (volatile uint32_t RT_FAR *)pvBitmap + (iBitEnd >> 5);
7101 *pu32 &= RT_H2LE_U32(~((UINT32_C(1) << (iBitEnd & 31)) - 1));
7102 }
7103 }
7104 }
7105}
7106
7107
7108/**
7109 * Sets a bit range within a bitmap.
7110 *
7111 * @param pvBitmap Pointer to the bitmap (little endian).
7112 * @param iBitStart The First bit to set.
7113 * @param iBitEnd The first bit not to set.
7114 */
7115DECLINLINE(void) ASMBitSetRange(volatile void RT_FAR *pvBitmap, size_t iBitStart, size_t iBitEnd) RT_NOTHROW_DEF
7116{
7117 if (iBitStart < iBitEnd)
7118 {
7119 uint32_t volatile RT_FAR *pu32 = (volatile uint32_t RT_FAR *)pvBitmap + (iBitStart >> 5);
7120 size_t iStart = iBitStart & ~(size_t)31;
7121 size_t iEnd = iBitEnd & ~(size_t)31;
7122 if (iStart == iEnd)
7123 *pu32 |= RT_H2LE_U32(((UINT32_C(1) << (iBitEnd - iBitStart)) - 1) << (iBitStart & 31));
7124 else
7125 {
7126 /* bits in first dword. */
7127 if (iBitStart & 31)
7128 {
7129 *pu32 |= RT_H2LE_U32(~((UINT32_C(1) << (iBitStart & 31)) - 1));
7130 pu32++;
7131 iBitStart = iStart + 32;
7132 }
7133
7134 /* whole dword. */
7135 if (iBitStart != iEnd)
7136 ASMMemFill32(pu32, (iEnd - iBitStart) >> 3, ~UINT32_C(0));
7137
7138 /* bits in last dword. */
7139 if (iBitEnd & 31)
7140 {
7141 pu32 = (volatile uint32_t RT_FAR *)pvBitmap + (iBitEnd >> 5);
7142 *pu32 |= RT_H2LE_U32((UINT32_C(1) << (iBitEnd & 31)) - 1);
7143 }
7144 }
7145 }
7146}
7147
7148#endif /* IPRT_INCLUDED_asm_mem_h */
7149
7150/**
7151 * Finds the first clear bit in a bitmap.
7152 *
7153 * @returns Index of the first zero bit.
7154 * @returns -1 if no clear bit was found.
7155 * @param pvBitmap Pointer to the bitmap (little endian).
7156 * @param cBits The number of bits in the bitmap. Multiple of 32.
7157 */
7158#if RT_INLINE_ASM_EXTERNAL || (!defined(RT_ARCH_AMD64) && !defined(RT_ARCH_X86))
7159DECLASM(int32_t) ASMBitFirstClear(const volatile void RT_FAR *pvBitmap, uint32_t cBits) RT_NOTHROW_PROTO;
7160#else
7161DECLINLINE(int32_t) ASMBitFirstClear(const volatile void RT_FAR *pvBitmap, uint32_t cBits) RT_NOTHROW_DEF
7162{
7163 if (cBits)
7164 {
7165 int32_t iBit;
7166# if RT_INLINE_ASM_GNU_STYLE
7167 RTCCUINTREG uEAX, uECX, uEDI;
7168 cBits = RT_ALIGN_32(cBits, 32);
7169 __asm__ __volatile__("repe; scasl\n\t"
7170 "je 1f\n\t"
7171# ifdef RT_ARCH_AMD64
7172 "lea -4(%%rdi), %%rdi\n\t"
7173 "xorl (%%rdi), %%eax\n\t"
7174 "subq %5, %%rdi\n\t"
7175# else
7176 "lea -4(%%edi), %%edi\n\t"
7177 "xorl (%%edi), %%eax\n\t"
7178 "subl %5, %%edi\n\t"
7179# endif
7180 "shll $3, %%edi\n\t"
7181 "bsfl %%eax, %%edx\n\t"
7182 "addl %%edi, %%edx\n\t"
7183 "1:\t\n"
7184 : "=d" (iBit)
7185 , "=&c" (uECX)
7186 , "=&D" (uEDI)
7187 , "=&a" (uEAX)
7188 : "0" (0xffffffff)
7189 , "mr" (pvBitmap)
7190 , "1" (cBits >> 5)
7191 , "2" (pvBitmap)
7192 , "3" (0xffffffff)
7193 : "cc");
7194# else
7195 cBits = RT_ALIGN_32(cBits, 32);
7196 __asm
7197 {
7198# ifdef RT_ARCH_AMD64
7199 mov rdi, [pvBitmap]
7200 mov rbx, rdi
7201# else
7202 mov edi, [pvBitmap]
7203 mov ebx, edi
7204# endif
7205 mov edx, 0ffffffffh
7206 mov eax, edx
7207 mov ecx, [cBits]
7208 shr ecx, 5
7209 repe scasd
7210 je done
7211
7212# ifdef RT_ARCH_AMD64
7213 lea rdi, [rdi - 4]
7214 xor eax, [rdi]
7215 sub rdi, rbx
7216# else
7217 lea edi, [edi - 4]
7218 xor eax, [edi]
7219 sub edi, ebx
7220# endif
7221 shl edi, 3
7222 bsf edx, eax
7223 add edx, edi
7224 done:
7225 mov [iBit], edx
7226 }
7227# endif
7228 return iBit;
7229 }
7230 return -1;
7231}
7232#endif
7233
7234
7235/**
7236 * Finds the next clear bit in a bitmap.
7237 *
7238 * @returns Index of the first zero bit.
7239 * @returns -1 if no clear bit was found.
7240 * @param pvBitmap Pointer to the bitmap (little endian).
7241 * @param cBits The number of bits in the bitmap. Multiple of 32.
7242 * @param iBitPrev The bit returned from the last search.
7243 * The search will start at iBitPrev + 1.
7244 */
7245#if RT_INLINE_ASM_EXTERNAL || (!defined(RT_ARCH_AMD64) && !defined(RT_ARCH_X86))
7246DECLASM(int) ASMBitNextClear(const volatile void RT_FAR *pvBitmap, uint32_t cBits, uint32_t iBitPrev) RT_NOTHROW_PROTO;
7247#else
7248DECLINLINE(int) ASMBitNextClear(const volatile void RT_FAR *pvBitmap, uint32_t cBits, uint32_t iBitPrev) RT_NOTHROW_DEF
7249{
7250 const volatile uint32_t RT_FAR *pau32Bitmap = (const volatile uint32_t RT_FAR *)pvBitmap;
7251 int iBit = ++iBitPrev & 31;
7252 if (iBit)
7253 {
7254 /*
7255 * Inspect the 32-bit word containing the unaligned bit.
7256 */
7257 uint32_t u32 = ~pau32Bitmap[iBitPrev / 32] >> iBit;
7258
7259# if RT_INLINE_ASM_USES_INTRIN
7260 unsigned long ulBit = 0;
7261 if (_BitScanForward(&ulBit, u32))
7262 return ulBit + iBitPrev;
7263# else
7264# if RT_INLINE_ASM_GNU_STYLE
7265 __asm__ __volatile__("bsf %1, %0\n\t"
7266 "jnz 1f\n\t"
7267 "movl $-1, %0\n\t" /** @todo use conditional move for 64-bit? */
7268 "1:\n\t"
7269 : "=r" (iBit)
7270 : "r" (u32)
7271 : "cc");
7272# else
7273 __asm
7274 {
7275 mov edx, [u32]
7276 bsf eax, edx
7277 jnz done
7278 mov eax, 0ffffffffh
7279 done:
7280 mov [iBit], eax
7281 }
7282# endif
7283 if (iBit >= 0)
7284 return iBit + (int)iBitPrev;
7285# endif
7286
7287 /*
7288 * Skip ahead and see if there is anything left to search.
7289 */
7290 iBitPrev |= 31;
7291 iBitPrev++;
7292 if (cBits <= (uint32_t)iBitPrev)
7293 return -1;
7294 }
7295
7296 /*
7297 * 32-bit aligned search, let ASMBitFirstClear do the dirty work.
7298 */
7299 iBit = ASMBitFirstClear(&pau32Bitmap[iBitPrev / 32], cBits - iBitPrev);
7300 if (iBit >= 0)
7301 iBit += iBitPrev;
7302 return iBit;
7303}
7304#endif
7305
7306
7307/**
7308 * Finds the first set bit in a bitmap.
7309 *
7310 * @returns Index of the first set bit.
7311 * @returns -1 if no clear bit was found.
7312 * @param pvBitmap Pointer to the bitmap (little endian).
7313 * @param cBits The number of bits in the bitmap. Multiple of 32.
7314 */
7315#if RT_INLINE_ASM_EXTERNAL || (!defined(RT_ARCH_AMD64) && !defined(RT_ARCH_X86))
7316DECLASM(int32_t) ASMBitFirstSet(const volatile void RT_FAR *pvBitmap, uint32_t cBits) RT_NOTHROW_PROTO;
7317#else
7318DECLINLINE(int32_t) ASMBitFirstSet(const volatile void RT_FAR *pvBitmap, uint32_t cBits) RT_NOTHROW_DEF
7319{
7320 if (cBits)
7321 {
7322 int32_t iBit;
7323# if RT_INLINE_ASM_GNU_STYLE
7324 RTCCUINTREG uEAX, uECX, uEDI;
7325 cBits = RT_ALIGN_32(cBits, 32);
7326 __asm__ __volatile__("repe; scasl\n\t"
7327 "je 1f\n\t"
7328# ifdef RT_ARCH_AMD64
7329 "lea -4(%%rdi), %%rdi\n\t"
7330 "movl (%%rdi), %%eax\n\t"
7331 "subq %5, %%rdi\n\t"
7332# else
7333 "lea -4(%%edi), %%edi\n\t"
7334 "movl (%%edi), %%eax\n\t"
7335 "subl %5, %%edi\n\t"
7336# endif
7337 "shll $3, %%edi\n\t"
7338 "bsfl %%eax, %%edx\n\t"
7339 "addl %%edi, %%edx\n\t"
7340 "1:\t\n"
7341 : "=d" (iBit)
7342 , "=&c" (uECX)
7343 , "=&D" (uEDI)
7344 , "=&a" (uEAX)
7345 : "0" (0xffffffff)
7346 , "mr" (pvBitmap)
7347 , "1" (cBits >> 5)
7348 , "2" (pvBitmap)
7349 , "3" (0)
7350 : "cc");
7351# else
7352 cBits = RT_ALIGN_32(cBits, 32);
7353 __asm
7354 {
7355# ifdef RT_ARCH_AMD64
7356 mov rdi, [pvBitmap]
7357 mov rbx, rdi
7358# else
7359 mov edi, [pvBitmap]
7360 mov ebx, edi
7361# endif
7362 mov edx, 0ffffffffh
7363 xor eax, eax
7364 mov ecx, [cBits]
7365 shr ecx, 5
7366 repe scasd
7367 je done
7368# ifdef RT_ARCH_AMD64
7369 lea rdi, [rdi - 4]
7370 mov eax, [rdi]
7371 sub rdi, rbx
7372# else
7373 lea edi, [edi - 4]
7374 mov eax, [edi]
7375 sub edi, ebx
7376# endif
7377 shl edi, 3
7378 bsf edx, eax
7379 add edx, edi
7380 done:
7381 mov [iBit], edx
7382 }
7383# endif
7384 return iBit;
7385 }
7386 return -1;
7387}
7388#endif
7389
7390
7391/**
7392 * Finds the next set bit in a bitmap.
7393 *
7394 * @returns Index of the next set bit.
7395 * @returns -1 if no set bit was found.
7396 * @param pvBitmap Pointer to the bitmap (little endian).
7397 * @param cBits The number of bits in the bitmap. Multiple of 32.
7398 * @param iBitPrev The bit returned from the last search.
7399 * The search will start at iBitPrev + 1.
7400 */
7401#if RT_INLINE_ASM_EXTERNAL || (!defined(RT_ARCH_AMD64) && !defined(RT_ARCH_X86))
7402DECLASM(int) ASMBitNextSet(const volatile void RT_FAR *pvBitmap, uint32_t cBits, uint32_t iBitPrev) RT_NOTHROW_PROTO;
7403#else
7404DECLINLINE(int) ASMBitNextSet(const volatile void RT_FAR *pvBitmap, uint32_t cBits, uint32_t iBitPrev) RT_NOTHROW_DEF
7405{
7406 const volatile uint32_t RT_FAR *pau32Bitmap = (const volatile uint32_t RT_FAR *)pvBitmap;
7407 int iBit = ++iBitPrev & 31;
7408 if (iBit)
7409 {
7410 /*
7411 * Inspect the 32-bit word containing the unaligned bit.
7412 */
7413 uint32_t u32 = pau32Bitmap[iBitPrev / 32] >> iBit;
7414
7415# if RT_INLINE_ASM_USES_INTRIN
7416 unsigned long ulBit = 0;
7417 if (_BitScanForward(&ulBit, u32))
7418 return ulBit + iBitPrev;
7419# else
7420# if RT_INLINE_ASM_GNU_STYLE
7421 __asm__ __volatile__("bsf %1, %0\n\t"
7422 "jnz 1f\n\t" /** @todo use conditional move for 64-bit? */
7423 "movl $-1, %0\n\t"
7424 "1:\n\t"
7425 : "=r" (iBit)
7426 : "r" (u32)
7427 : "cc");
7428# else
7429 __asm
7430 {
7431 mov edx, [u32]
7432 bsf eax, edx
7433 jnz done
7434 mov eax, 0ffffffffh
7435 done:
7436 mov [iBit], eax
7437 }
7438# endif
7439 if (iBit >= 0)
7440 return iBit + (int)iBitPrev;
7441# endif
7442
7443 /*
7444 * Skip ahead and see if there is anything left to search.
7445 */
7446 iBitPrev |= 31;
7447 iBitPrev++;
7448 if (cBits <= (uint32_t)iBitPrev)
7449 return -1;
7450 }
7451
7452 /*
7453 * 32-bit aligned search, let ASMBitFirstClear do the dirty work.
7454 */
7455 iBit = ASMBitFirstSet(&pau32Bitmap[iBitPrev / 32], cBits - iBitPrev);
7456 if (iBit >= 0)
7457 iBit += iBitPrev;
7458 return iBit;
7459}
7460#endif
7461
7462
7463/**
7464 * Finds the first bit which is set in the given 32-bit integer.
7465 * Bits are numbered from 1 (least significant) to 32.
7466 *
7467 * @returns index [1..32] of the first set bit.
7468 * @returns 0 if all bits are cleared.
7469 * @param u32 Integer to search for set bits.
7470 * @remarks Similar to ffs() in BSD.
7471 */
7472#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
7473RT_ASM_DECL_PRAGMA_WATCOM_386(unsigned) ASMBitFirstSetU32(uint32_t u32) RT_NOTHROW_PROTO;
7474#else
7475DECLINLINE(unsigned) ASMBitFirstSetU32(uint32_t u32) RT_NOTHROW_DEF
7476{
7477# if RT_INLINE_ASM_USES_INTRIN
7478 unsigned long iBit;
7479 if (_BitScanForward(&iBit, u32))
7480 iBit++;
7481 else
7482 iBit = 0;
7483
7484# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
7485# if RT_INLINE_ASM_GNU_STYLE
7486 uint32_t iBit;
7487 __asm__ __volatile__("bsf %1, %0\n\t"
7488 "jnz 1f\n\t"
7489 "xorl %0, %0\n\t"
7490 "jmp 2f\n"
7491 "1:\n\t"
7492 "incl %0\n"
7493 "2:\n\t"
7494 : "=r" (iBit)
7495 : "rm" (u32)
7496 : "cc");
7497# else
7498 uint32_t iBit;
7499 _asm
7500 {
7501 bsf eax, [u32]
7502 jnz found
7503 xor eax, eax
7504 jmp done
7505 found:
7506 inc eax
7507 done:
7508 mov [iBit], eax
7509 }
7510# endif
7511
7512# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
7513 /*
7514 * Using the "count leading zeros (clz)" instruction here because there
7515 * is no dedicated instruction to get the first set bit.
7516 * Need to reverse the bits in the value with "rbit" first because
7517 * "clz" starts counting from the most significant bit.
7518 */
7519 uint32_t iBit;
7520 __asm__ __volatile__(
7521# if defined(RT_ARCH_ARM64)
7522 "rbit %w[uVal], %w[uVal]\n\t"
7523 "clz %w[iBit], %w[uVal]\n\t"
7524# else
7525 "rbit %[uVal], %[uVal]\n\t"
7526 "clz %[iBit], %[uVal]\n\t"
7527# endif
7528 : [uVal] "=r" (u32)
7529 , [iBit] "=r" (iBit)
7530 : "[uVal]" (u32));
7531 if (iBit != 32)
7532 iBit++;
7533 else
7534 iBit = 0; /* No bit set. */
7535
7536# else
7537# error "Port me"
7538# endif
7539 return iBit;
7540}
7541#endif
7542
7543
7544/**
7545 * Finds the first bit which is set in the given 32-bit integer.
7546 * Bits are numbered from 1 (least significant) to 32.
7547 *
7548 * @returns index [1..32] of the first set bit.
7549 * @returns 0 if all bits are cleared.
7550 * @param i32 Integer to search for set bits.
7551 * @remark Similar to ffs() in BSD.
7552 */
7553DECLINLINE(unsigned) ASMBitFirstSetS32(int32_t i32) RT_NOTHROW_DEF
7554{
7555 return ASMBitFirstSetU32((uint32_t)i32);
7556}
7557
7558
7559/**
7560 * Finds the first bit which is set in the given 64-bit integer.
7561 *
7562 * Bits are numbered from 1 (least significant) to 64.
7563 *
7564 * @returns index [1..64] of the first set bit.
7565 * @returns 0 if all bits are cleared.
7566 * @param u64 Integer to search for set bits.
7567 * @remarks Similar to ffs() in BSD.
7568 */
7569#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
7570RT_ASM_DECL_PRAGMA_WATCOM_386(unsigned) ASMBitFirstSetU64(uint64_t u64) RT_NOTHROW_PROTO;
7571#else
7572DECLINLINE(unsigned) ASMBitFirstSetU64(uint64_t u64) RT_NOTHROW_DEF
7573{
7574# if RT_INLINE_ASM_USES_INTRIN
7575 unsigned long iBit;
7576# if ARCH_BITS == 64
7577 if (_BitScanForward64(&iBit, u64))
7578 iBit++;
7579 else
7580 iBit = 0;
7581# else
7582 if (_BitScanForward(&iBit, (uint32_t)u64))
7583 iBit++;
7584 else if (_BitScanForward(&iBit, (uint32_t)(u64 >> 32)))
7585 iBit += 33;
7586 else
7587 iBit = 0;
7588# endif
7589
7590# elif RT_INLINE_ASM_GNU_STYLE && defined(RT_ARCH_AMD64)
7591 uint64_t iBit;
7592 __asm__ __volatile__("bsfq %1, %0\n\t"
7593 "jnz 1f\n\t"
7594 "xorl %k0, %k0\n\t"
7595 "jmp 2f\n"
7596 "1:\n\t"
7597 "incl %k0\n"
7598 "2:\n\t"
7599 : "=r" (iBit)
7600 : "rm" (u64)
7601 : "cc");
7602
7603# elif defined(RT_ARCH_ARM64)
7604 uint64_t iBit;
7605 __asm__ __volatile__("rbit %[uVal], %[uVal]\n\t"
7606 "clz %[iBit], %[uVal]\n\t"
7607 : [uVal] "=r" (u64)
7608 , [iBit] "=r" (iBit)
7609 : "[uVal]" (u64));
7610 if (iBit != 64)
7611 iBit++;
7612 else
7613 iBit = 0; /* No bit set. */
7614
7615# else
7616 unsigned iBit = ASMBitFirstSetU32((uint32_t)u64);
7617 if (!iBit)
7618 {
7619 iBit = ASMBitFirstSetU32((uint32_t)(u64 >> 32));
7620 if (iBit)
7621 iBit += 32;
7622 }
7623# endif
7624 return (unsigned)iBit;
7625}
7626#endif
7627
7628
7629/**
7630 * Finds the first bit which is set in the given 16-bit integer.
7631 *
7632 * Bits are numbered from 1 (least significant) to 16.
7633 *
7634 * @returns index [1..16] of the first set bit.
7635 * @returns 0 if all bits are cleared.
7636 * @param u16 Integer to search for set bits.
7637 * @remarks For 16-bit bs3kit code.
7638 */
7639#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
7640RT_ASM_DECL_PRAGMA_WATCOM_386(unsigned) ASMBitFirstSetU16(uint16_t u16) RT_NOTHROW_PROTO;
7641#else
7642DECLINLINE(unsigned) ASMBitFirstSetU16(uint16_t u16) RT_NOTHROW_DEF
7643{
7644 return ASMBitFirstSetU32((uint32_t)u16);
7645}
7646#endif
7647
7648
7649/**
7650 * Finds the last bit which is set in the given 32-bit integer.
7651 * Bits are numbered from 1 (least significant) to 32.
7652 *
7653 * @returns index [1..32] of the last set bit.
7654 * @returns 0 if all bits are cleared.
7655 * @param u32 Integer to search for set bits.
7656 * @remark Similar to fls() in BSD.
7657 */
7658#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
7659RT_ASM_DECL_PRAGMA_WATCOM_386(unsigned) ASMBitLastSetU32(uint32_t u32) RT_NOTHROW_PROTO;
7660#else
7661DECLINLINE(unsigned) ASMBitLastSetU32(uint32_t u32) RT_NOTHROW_DEF
7662{
7663# if RT_INLINE_ASM_USES_INTRIN
7664 unsigned long iBit;
7665 if (_BitScanReverse(&iBit, u32))
7666 iBit++;
7667 else
7668 iBit = 0;
7669
7670# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
7671# if RT_INLINE_ASM_GNU_STYLE
7672 uint32_t iBit;
7673 __asm__ __volatile__("bsrl %1, %0\n\t"
7674 "jnz 1f\n\t"
7675 "xorl %0, %0\n\t"
7676 "jmp 2f\n"
7677 "1:\n\t"
7678 "incl %0\n"
7679 "2:\n\t"
7680 : "=r" (iBit)
7681 : "rm" (u32)
7682 : "cc");
7683# else
7684 uint32_t iBit;
7685 _asm
7686 {
7687 bsr eax, [u32]
7688 jnz found
7689 xor eax, eax
7690 jmp done
7691 found:
7692 inc eax
7693 done:
7694 mov [iBit], eax
7695 }
7696# endif
7697
7698# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
7699 uint32_t iBit;
7700 __asm__ __volatile__(
7701# if defined(RT_ARCH_ARM64)
7702 "clz %w[iBit], %w[uVal]\n\t"
7703# else
7704 "clz %[iBit], %[uVal]\n\t"
7705# endif
7706 : [iBit] "=r" (iBit)
7707 : [uVal] "r" (u32));
7708 iBit = 32 - iBit;
7709
7710# else
7711# error "Port me"
7712# endif
7713 return iBit;
7714}
7715#endif
7716
7717
7718/**
7719 * Finds the last bit which is set in the given 32-bit integer.
7720 * Bits are numbered from 1 (least significant) to 32.
7721 *
7722 * @returns index [1..32] of the last set bit.
7723 * @returns 0 if all bits are cleared.
7724 * @param i32 Integer to search for set bits.
7725 * @remark Similar to fls() in BSD.
7726 */
7727DECLINLINE(unsigned) ASMBitLastSetS32(int32_t i32) RT_NOTHROW_DEF
7728{
7729 return ASMBitLastSetU32((uint32_t)i32);
7730}
7731
7732
7733/**
7734 * Finds the last bit which is set in the given 64-bit integer.
7735 *
7736 * Bits are numbered from 1 (least significant) to 64.
7737 *
7738 * @returns index [1..64] of the last set bit.
7739 * @returns 0 if all bits are cleared.
7740 * @param u64 Integer to search for set bits.
7741 * @remark Similar to fls() in BSD.
7742 */
7743#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
7744RT_ASM_DECL_PRAGMA_WATCOM_386(unsigned) ASMBitLastSetU64(uint64_t u64) RT_NOTHROW_PROTO;
7745#else
7746DECLINLINE(unsigned) ASMBitLastSetU64(uint64_t u64) RT_NOTHROW_DEF
7747{
7748# if RT_INLINE_ASM_USES_INTRIN
7749 unsigned long iBit;
7750# if ARCH_BITS == 64
7751 if (_BitScanReverse64(&iBit, u64))
7752 iBit++;
7753 else
7754 iBit = 0;
7755# else
7756 if (_BitScanReverse(&iBit, (uint32_t)(u64 >> 32)))
7757 iBit += 33;
7758 else if (_BitScanReverse(&iBit, (uint32_t)u64))
7759 iBit++;
7760 else
7761 iBit = 0;
7762# endif
7763
7764# elif RT_INLINE_ASM_GNU_STYLE && defined(RT_ARCH_AMD64)
7765 uint64_t iBit;
7766 __asm__ __volatile__("bsrq %1, %0\n\t"
7767 "jnz 1f\n\t"
7768 "xorl %k0, %k0\n\t"
7769 "jmp 2f\n"
7770 "1:\n\t"
7771 "incl %k0\n"
7772 "2:\n\t"
7773 : "=r" (iBit)
7774 : "rm" (u64)
7775 : "cc");
7776
7777# elif defined(RT_ARCH_ARM64)
7778 uint64_t iBit;
7779 __asm__ __volatile__("clz %[iBit], %[uVal]\n\t"
7780 : [iBit] "=r" (iBit)
7781 : [uVal] "r" (u64));
7782 iBit = 64 - iBit;
7783
7784# else
7785 unsigned iBit = ASMBitLastSetU32((uint32_t)(u64 >> 32));
7786 if (iBit)
7787 iBit += 32;
7788 else
7789 iBit = ASMBitLastSetU32((uint32_t)u64);
7790# endif
7791 return (unsigned)iBit;
7792}
7793#endif
7794
7795
7796/**
7797 * Finds the last bit which is set in the given 16-bit integer.
7798 *
7799 * Bits are numbered from 1 (least significant) to 16.
7800 *
7801 * @returns index [1..16] of the last set bit.
7802 * @returns 0 if all bits are cleared.
7803 * @param u16 Integer to search for set bits.
7804 * @remarks For 16-bit bs3kit code.
7805 */
7806#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
7807RT_ASM_DECL_PRAGMA_WATCOM_386(unsigned) ASMBitLastSetU16(uint16_t u16) RT_NOTHROW_PROTO;
7808#else
7809DECLINLINE(unsigned) ASMBitLastSetU16(uint16_t u16) RT_NOTHROW_DEF
7810{
7811 return ASMBitLastSetU32((uint32_t)u16);
7812}
7813#endif
7814
7815
7816/**
7817 * Count the number of leading zero bits in the given 32-bit integer.
7818 *
7819 * The counting starts with the most significate bit.
7820 *
7821 * @returns Number of most significant zero bits.
7822 * @returns 32 if all bits are cleared.
7823 * @param u32 Integer to consider.
7824 * @remarks Similar to __builtin_clz() in gcc, except defined zero input result.
7825 */
7826#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
7827RT_ASM_DECL_PRAGMA_WATCOM_386(unsigned) ASMCountLeadingZerosU32(uint32_t u32) RT_NOTHROW_PROTO;
7828#else
7829DECLINLINE(unsigned) ASMCountLeadingZerosU32(uint32_t u32) RT_NOTHROW_DEF
7830{
7831# if RT_INLINE_ASM_USES_INTRIN
7832 unsigned long iBit;
7833 if (!_BitScanReverse(&iBit, u32))
7834 return 32;
7835 return 31 - (unsigned)iBit;
7836
7837# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
7838 uint32_t iBit;
7839# if RT_INLINE_ASM_GNU_STYLE && defined(RT_ARCH_AMD64) && 0 /* significantly slower on 10980xe; 929 vs 237 ps/call */
7840 __asm__ __volatile__("bsrl %1, %0\n\t"
7841 "cmovzl %2, %0\n\t"
7842 : "=&r" (iBit)
7843 : "rm" (u32)
7844 , "rm" ((int32_t)-1)
7845 : "cc");
7846# elif RT_INLINE_ASM_GNU_STYLE
7847 __asm__ __volatile__("bsr %1, %0\n\t"
7848 "jnz 1f\n\t"
7849 "mov $-1, %0\n\t"
7850 "1:\n\t"
7851 : "=r" (iBit)
7852 : "rm" (u32)
7853 : "cc");
7854# else
7855 _asm
7856 {
7857 bsr eax, [u32]
7858 jnz found
7859 mov eax, -1
7860 found:
7861 mov [iBit], eax
7862 }
7863# endif
7864 return 31 - iBit;
7865
7866# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
7867 uint32_t iBit;
7868 __asm__ __volatile__(
7869# if defined(RT_ARCH_ARM64)
7870 "clz %w[iBit], %w[uVal]\n\t"
7871# else
7872 "clz %[iBit], %[uVal]\n\t"
7873# endif
7874 : [uVal] "=r" (u32)
7875 , [iBit] "=r" (iBit)
7876 : "[uVal]" (u32));
7877 return iBit;
7878
7879# elif defined(__GNUC__)
7880 AssertCompile(sizeof(u32) == sizeof(unsigned int));
7881 return u32 ? __builtin_clz(u32) : 32;
7882
7883# else
7884# error "Port me"
7885# endif
7886}
7887#endif
7888
7889
7890/**
7891 * Count the number of leading zero bits in the given 64-bit integer.
7892 *
7893 * The counting starts with the most significate bit.
7894 *
7895 * @returns Number of most significant zero bits.
7896 * @returns 64 if all bits are cleared.
7897 * @param u64 Integer to consider.
7898 * @remarks Similar to __builtin_clzl() in gcc, except defined zero input
7899 * result.
7900 */
7901#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
7902RT_ASM_DECL_PRAGMA_WATCOM_386(unsigned) ASMCountLeadingZerosU64(uint64_t u64) RT_NOTHROW_PROTO;
7903#else
7904DECLINLINE(unsigned) ASMCountLeadingZerosU64(uint64_t u64) RT_NOTHROW_DEF
7905{
7906# if RT_INLINE_ASM_USES_INTRIN
7907 unsigned long iBit;
7908# if ARCH_BITS == 64
7909 if (_BitScanReverse64(&iBit, u64))
7910 return 63 - (unsigned)iBit;
7911# else
7912 if (_BitScanReverse(&iBit, (uint32_t)(u64 >> 32)))
7913 return 31 - (unsigned)iBit;
7914 if (_BitScanReverse(&iBit, (uint32_t)u64))
7915 return 63 - (unsigned)iBit;
7916# endif
7917 return 64;
7918
7919# elif RT_INLINE_ASM_GNU_STYLE && defined(RT_ARCH_AMD64)
7920 uint64_t iBit;
7921# if 0 /* 10980xe benchmark: 932 ps/call - the slower variant */
7922 __asm__ __volatile__("bsrq %1, %0\n\t"
7923 "cmovzq %2, %0\n\t"
7924 : "=&r" (iBit)
7925 : "rm" (u64)
7926 , "rm" ((int64_t)-1)
7927 : "cc");
7928# else /* 10980xe benchmark: 262 ps/call */
7929 __asm__ __volatile__("bsrq %1, %0\n\t"
7930 "jnz 1f\n\t"
7931 "mov $-1, %0\n\t"
7932 "1:\n\t"
7933 : "=&r" (iBit)
7934 : "rm" (u64)
7935 : "cc");
7936# endif
7937 return 63 - (unsigned)iBit;
7938
7939# elif defined(RT_ARCH_ARM64)
7940 uint64_t iBit;
7941 __asm__ __volatile__("clz %[iBit], %[uVal]\n\t"
7942 : [uVal] "=r" (u64)
7943 , [iBit] "=r" (iBit)
7944 : "[uVal]" (u64));
7945 return (unsigned)iBit;
7946
7947# elif defined(__GNUC__) && ARCH_BITS == 64
7948 AssertCompile(sizeof(u64) == sizeof(unsigned long));
7949 return u64 ? __builtin_clzl(u64) : 64;
7950
7951# else
7952 unsigned iBit = ASMCountLeadingZerosU32((uint32_t)(u64 >> 32));
7953 if (iBit == 32)
7954 iBit = ASMCountLeadingZerosU32((uint32_t)u64) + 32;
7955 return iBit;
7956# endif
7957}
7958#endif
7959
7960
7961/**
7962 * Count the number of leading zero bits in the given 16-bit integer.
7963 *
7964 * The counting starts with the most significate bit.
7965 *
7966 * @returns Number of most significant zero bits.
7967 * @returns 16 if all bits are cleared.
7968 * @param u16 Integer to consider.
7969 */
7970#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
7971RT_ASM_DECL_PRAGMA_WATCOM_386(unsigned) ASMCountLeadingZerosU16(uint16_t u16) RT_NOTHROW_PROTO;
7972#else
7973DECLINLINE(unsigned) ASMCountLeadingZerosU16(uint16_t u16) RT_NOTHROW_DEF
7974{
7975# if RT_INLINE_ASM_GNU_STYLE && (defined(RT_ARCH_X86) || defined(RT_ARCH_AMD64)) && 0 /* slower (10980xe: 987 vs 292 ps/call) */
7976 uint16_t iBit;
7977 __asm__ __volatile__("bsrw %1, %0\n\t"
7978 "jnz 1f\n\t"
7979 "mov $-1, %0\n\t"
7980 "1:\n\t"
7981 : "=r" (iBit)
7982 : "rm" (u16)
7983 : "cc");
7984 return 15 - (int16_t)iBit;
7985# else
7986 return ASMCountLeadingZerosU32((uint32_t)u16) - 16;
7987# endif
7988}
7989#endif
7990
7991
7992/**
7993 * Count the number of trailing zero bits in the given 32-bit integer.
7994 *
7995 * The counting starts with the least significate bit, i.e. the zero bit.
7996 *
7997 * @returns Number of least significant zero bits.
7998 * @returns 32 if all bits are cleared.
7999 * @param u32 Integer to consider.
8000 * @remarks Similar to __builtin_ctz() in gcc, except defined zero input result.
8001 */
8002#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
8003RT_ASM_DECL_PRAGMA_WATCOM_386(unsigned) ASMCountTrailingZerosU32(uint32_t u32) RT_NOTHROW_PROTO;
8004#else
8005DECLINLINE(unsigned) ASMCountTrailingZerosU32(uint32_t u32) RT_NOTHROW_DEF
8006{
8007# if RT_INLINE_ASM_USES_INTRIN
8008 unsigned long iBit;
8009 if (!_BitScanForward(&iBit, u32))
8010 return 32;
8011 return (unsigned)iBit;
8012
8013# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
8014 uint32_t iBit;
8015# if RT_INLINE_ASM_GNU_STYLE && defined(RT_ARCH_AMD64) && 0 /* significantly slower on 10980xe; 932 vs 240 ps/call */
8016 __asm__ __volatile__("bsfl %1, %0\n\t"
8017 "cmovzl %2, %0\n\t"
8018 : "=&r" (iBit)
8019 : "rm" (u32)
8020 , "rm" ((int32_t)32)
8021 : "cc");
8022# elif RT_INLINE_ASM_GNU_STYLE
8023 __asm__ __volatile__("bsfl %1, %0\n\t"
8024 "jnz 1f\n\t"
8025 "mov $32, %0\n\t"
8026 "1:\n\t"
8027 : "=r" (iBit)
8028 : "rm" (u32)
8029 : "cc");
8030# else
8031 _asm
8032 {
8033 bsf eax, [u32]
8034 jnz found
8035 mov eax, 32
8036 found:
8037 mov [iBit], eax
8038 }
8039# endif
8040 return iBit;
8041
8042# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
8043 /* Invert the bits and use clz. */
8044 uint32_t iBit;
8045 __asm__ __volatile__(
8046# if defined(RT_ARCH_ARM64)
8047 "rbit %w[uVal], %w[uVal]\n\t"
8048 "clz %w[iBit], %w[uVal]\n\t"
8049# else
8050 "rbit %[uVal], %[uVal]\n\t"
8051 "clz %[iBit], %[uVal]\n\t"
8052# endif
8053 : [uVal] "=r" (u32)
8054 , [iBit] "=r" (iBit)
8055 : "[uVal]" (u32));
8056 return iBit;
8057
8058# elif defined(__GNUC__)
8059 AssertCompile(sizeof(u32) == sizeof(unsigned int));
8060 return u32 ? __builtin_ctz(u32) : 32;
8061
8062# else
8063# error "Port me"
8064# endif
8065}
8066#endif
8067
8068
8069/**
8070 * Count the number of trailing zero bits in the given 64-bit integer.
8071 *
8072 * The counting starts with the least significate bit.
8073 *
8074 * @returns Number of least significant zero bits.
8075 * @returns 64 if all bits are cleared.
8076 * @param u64 Integer to consider.
8077 * @remarks Similar to __builtin_ctzl() in gcc, except defined zero input
8078 * result.
8079 */
8080#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
8081RT_ASM_DECL_PRAGMA_WATCOM_386(unsigned) ASMCountTrailingZerosU64(uint64_t u64) RT_NOTHROW_PROTO;
8082#else
8083DECLINLINE(unsigned) ASMCountTrailingZerosU64(uint64_t u64) RT_NOTHROW_DEF
8084{
8085# if RT_INLINE_ASM_USES_INTRIN
8086 unsigned long iBit;
8087# if ARCH_BITS == 64
8088 if (_BitScanForward64(&iBit, u64))
8089 return (unsigned)iBit;
8090# else
8091 if (_BitScanForward(&iBit, (uint32_t)u64))
8092 return (unsigned)iBit;
8093 if (_BitScanForward(&iBit, (uint32_t)(u64 >> 32)))
8094 return (unsigned)iBit + 32;
8095# endif
8096 return 64;
8097
8098# elif RT_INLINE_ASM_GNU_STYLE && defined(RT_ARCH_AMD64)
8099 uint64_t iBit;
8100# if 0 /* 10980xe benchmark: 932 ps/call - the slower variant */
8101 __asm__ __volatile__("bsfq %1, %0\n\t"
8102 "cmovzq %2, %0\n\t"
8103 : "=&r" (iBit)
8104 : "rm" (u64)
8105 , "rm" ((int64_t)64)
8106 : "cc");
8107# else /* 10980xe benchmark: 262 ps/call */
8108 __asm__ __volatile__("bsfq %1, %0\n\t"
8109 "jnz 1f\n\t"
8110 "mov $64, %0\n\t"
8111 "1:\n\t"
8112 : "=&r" (iBit)
8113 : "rm" (u64)
8114 : "cc");
8115# endif
8116 return (unsigned)iBit;
8117
8118# elif defined(RT_ARCH_ARM64)
8119 /* Invert the bits and use clz. */
8120 uint64_t iBit;
8121 __asm__ __volatile__("rbit %[uVal], %[uVal]\n\t"
8122 "clz %[iBit], %[uVal]\n\t"
8123 : [uVal] "=r" (u64)
8124 , [iBit] "=r" (iBit)
8125 : "[uVal]" (u64));
8126 return (unsigned)iBit;
8127
8128# elif defined(__GNUC__) && ARCH_BITS == 64
8129 AssertCompile(sizeof(u64) == sizeof(unsigned long));
8130 return u64 ? __builtin_ctzl(u64) : 64;
8131
8132# else
8133 unsigned iBit = ASMCountTrailingZerosU32((uint32_t)u64);
8134 if (iBit == 32)
8135 iBit = ASMCountTrailingZerosU32((uint32_t)(u64 >> 32)) + 32;
8136 return iBit;
8137# endif
8138}
8139#endif
8140
8141
8142/**
8143 * Count the number of trailing zero bits in the given 16-bit integer.
8144 *
8145 * The counting starts with the most significate bit.
8146 *
8147 * @returns Number of most significant zero bits.
8148 * @returns 16 if all bits are cleared.
8149 * @param u16 Integer to consider.
8150 */
8151#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
8152RT_ASM_DECL_PRAGMA_WATCOM_386(unsigned) ASMCountTrailingZerosU16(uint16_t u16) RT_NOTHROW_PROTO;
8153#else
8154DECLINLINE(unsigned) ASMCountTrailingZerosU16(uint16_t u16) RT_NOTHROW_DEF
8155{
8156# if RT_INLINE_ASM_GNU_STYLE && (defined(RT_ARCH_X86) || defined(RT_ARCH_AMD64)) && 0 /* slower (10980xe: 992 vs 349 ps/call) */
8157 uint16_t iBit;
8158 __asm__ __volatile__("bsfw %1, %0\n\t"
8159 "jnz 1f\n\t"
8160 "mov $16, %0\n\t"
8161 "1:\n\t"
8162 : "=r" (iBit)
8163 : "rm" (u16)
8164 : "cc");
8165 return iBit;
8166# else
8167 return ASMCountTrailingZerosU32((uint32_t)u16 | UINT32_C(0x10000));
8168#endif
8169}
8170#endif
8171
8172
8173/**
8174 * Rotate 32-bit unsigned value to the left by @a cShift.
8175 *
8176 * @returns Rotated value.
8177 * @param u32 The value to rotate.
8178 * @param cShift How many bits to rotate by.
8179 */
8180#ifdef __WATCOMC__
8181RT_ASM_DECL_PRAGMA_WATCOM(uint32_t) ASMRotateLeftU32(uint32_t u32, unsigned cShift) RT_NOTHROW_PROTO;
8182#else
8183DECLINLINE(uint32_t) ASMRotateLeftU32(uint32_t u32, uint32_t cShift) RT_NOTHROW_DEF
8184{
8185# if RT_INLINE_ASM_USES_INTRIN
8186 return _rotl(u32, cShift);
8187
8188# elif RT_INLINE_ASM_GNU_STYLE && (defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86))
8189 __asm__ __volatile__("roll %b1, %0" : "=g" (u32) : "Ic" (cShift), "0" (u32) : "cc");
8190 return u32;
8191
8192# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
8193 __asm__ __volatile__(
8194# if defined(RT_ARCH_ARM64)
8195 "ror %w[uRet], %w[uVal], %w[cShift]\n\t"
8196# else
8197 "ror %[uRet], %[uVal], %[cShift]\n\t"
8198# endif
8199 : [uRet] "=r" (u32)
8200 : [uVal] "[uRet]" (u32)
8201 , [cShift] "r" (32 - (cShift & 31))); /** @todo there is an immediate form here */
8202 return u32;
8203
8204# else
8205 cShift &= 31;
8206 return (u32 << cShift) | (u32 >> (32 - cShift));
8207# endif
8208}
8209#endif
8210
8211
8212/**
8213 * Rotate 32-bit unsigned value to the right by @a cShift.
8214 *
8215 * @returns Rotated value.
8216 * @param u32 The value to rotate.
8217 * @param cShift How many bits to rotate by.
8218 */
8219#ifdef __WATCOMC__
8220RT_ASM_DECL_PRAGMA_WATCOM(uint32_t) ASMRotateRightU32(uint32_t u32, unsigned cShift) RT_NOTHROW_PROTO;
8221#else
8222DECLINLINE(uint32_t) ASMRotateRightU32(uint32_t u32, uint32_t cShift) RT_NOTHROW_DEF
8223{
8224# if RT_INLINE_ASM_USES_INTRIN
8225 return _rotr(u32, cShift);
8226
8227# elif RT_INLINE_ASM_GNU_STYLE && (defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86))
8228 __asm__ __volatile__("rorl %b1, %0" : "=g" (u32) : "Ic" (cShift), "0" (u32) : "cc");
8229 return u32;
8230
8231# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
8232 __asm__ __volatile__(
8233# if defined(RT_ARCH_ARM64)
8234 "ror %w[uRet], %w[uVal], %w[cShift]\n\t"
8235# else
8236 "ror %[uRet], %[uVal], %[cShift]\n\t"
8237# endif
8238 : [uRet] "=r" (u32)
8239 : [uVal] "[uRet]" (u32)
8240 , [cShift] "r" (cShift & 31)); /** @todo there is an immediate form here */
8241 return u32;
8242
8243# else
8244 cShift &= 31;
8245 return (u32 >> cShift) | (u32 << (32 - cShift));
8246# endif
8247}
8248#endif
8249
8250
8251/**
8252 * Rotate 64-bit unsigned value to the left by @a cShift.
8253 *
8254 * @returns Rotated value.
8255 * @param u64 The value to rotate.
8256 * @param cShift How many bits to rotate by.
8257 */
8258DECLINLINE(uint64_t) ASMRotateLeftU64(uint64_t u64, uint32_t cShift) RT_NOTHROW_DEF
8259{
8260#if RT_INLINE_ASM_USES_INTRIN
8261 return _rotl64(u64, cShift);
8262
8263#elif RT_INLINE_ASM_GNU_STYLE && defined(RT_ARCH_AMD64)
8264 __asm__ __volatile__("rolq %b1, %0" : "=g" (u64) : "Jc" (cShift), "0" (u64) : "cc");
8265 return u64;
8266
8267#elif RT_INLINE_ASM_GNU_STYLE && defined(RT_ARCH_X86)
8268 uint32_t uSpill;
8269 __asm__ __volatile__("testb $0x20, %%cl\n\t" /* if (cShift >= 0x20) { swap(u64.hi, u64lo); cShift -= 0x20; } */
8270 "jz 1f\n\t"
8271 "xchgl %%eax, %%edx\n\t"
8272 "1:\n\t"
8273 "andb $0x1f, %%cl\n\t" /* if (cShift & 0x1f) { */
8274 "jz 2f\n\t"
8275 "movl %%edx, %2\n\t" /* save the hi value in %3. */
8276 "shldl %%cl,%%eax,%%edx\n\t" /* shift the hi value left, feeding MSBits from the low value. */
8277 "shldl %%cl,%2,%%eax\n\t" /* shift the lo value left, feeding MSBits from the saved hi value. */
8278 "2:\n\t" /* } */
8279 : "=A" (u64)
8280 , "=c" (cShift)
8281 , "=r" (uSpill)
8282 : "0" (u64)
8283 , "1" (cShift)
8284 : "cc");
8285 return u64;
8286
8287# elif defined(RT_ARCH_ARM64)
8288 __asm__ __volatile__("ror %[uRet], %[uVal], %[cShift]\n\t"
8289 : [uRet] "=r" (u64)
8290 : [uVal] "[uRet]" (u64)
8291 , [cShift] "r" ((uint64_t)(64 - (cShift & 63)))); /** @todo there is an immediate form here */
8292 return u64;
8293
8294#else
8295 cShift &= 63;
8296 return (u64 << cShift) | (u64 >> (64 - cShift));
8297#endif
8298}
8299
8300
8301/**
8302 * Rotate 64-bit unsigned value to the right by @a cShift.
8303 *
8304 * @returns Rotated value.
8305 * @param u64 The value to rotate.
8306 * @param cShift How many bits to rotate by.
8307 */
8308DECLINLINE(uint64_t) ASMRotateRightU64(uint64_t u64, uint32_t cShift) RT_NOTHROW_DEF
8309{
8310#if RT_INLINE_ASM_USES_INTRIN
8311 return _rotr64(u64, cShift);
8312
8313#elif RT_INLINE_ASM_GNU_STYLE && defined(RT_ARCH_AMD64)
8314 __asm__ __volatile__("rorq %b1, %0" : "=g" (u64) : "Jc" (cShift), "0" (u64) : "cc");
8315 return u64;
8316
8317#elif RT_INLINE_ASM_GNU_STYLE && defined(RT_ARCH_X86)
8318 uint32_t uSpill;
8319 __asm__ __volatile__("testb $0x20, %%cl\n\t" /* if (cShift >= 0x20) { swap(u64.hi, u64lo); cShift -= 0x20; } */
8320 "jz 1f\n\t"
8321 "xchgl %%eax, %%edx\n\t"
8322 "1:\n\t"
8323 "andb $0x1f, %%cl\n\t" /* if (cShift & 0x1f) { */
8324 "jz 2f\n\t"
8325 "movl %%edx, %2\n\t" /* save the hi value in %3. */
8326 "shrdl %%cl,%%eax,%%edx\n\t" /* shift the hi value right, feeding LSBits from the low value. */
8327 "shrdl %%cl,%2,%%eax\n\t" /* shift the lo value right, feeding LSBits from the saved hi value. */
8328 "2:\n\t" /* } */
8329 : "=A" (u64)
8330 , "=c" (cShift)
8331 , "=r" (uSpill)
8332 : "0" (u64)
8333 , "1" (cShift)
8334 : "cc");
8335 return u64;
8336
8337# elif defined(RT_ARCH_ARM64)
8338 __asm__ __volatile__("ror %[uRet], %[uVal], %[cShift]\n\t"
8339 : [uRet] "=r" (u64)
8340 : [uVal] "[uRet]" (u64)
8341 , [cShift] "r" ((uint64_t)(cShift & 63))); /** @todo there is an immediate form here */
8342 return u64;
8343
8344#else
8345 cShift &= 63;
8346 return (u64 >> cShift) | (u64 << (64 - cShift));
8347#endif
8348}
8349
8350/** @} */
8351
8352
8353/** @} */
8354
8355/*
8356 * Include #pragma aux definitions for Watcom C/C++.
8357 */
8358#if defined(__WATCOMC__) && ARCH_BITS == 16 && defined(RT_ARCH_X86)
8359# define IPRT_ASM_WATCOM_X86_16_WITH_PRAGMAS
8360# undef IPRT_INCLUDED_asm_watcom_x86_16_h
8361# include "asm-watcom-x86-16.h"
8362#elif defined(__WATCOMC__) && ARCH_BITS == 32 && defined(RT_ARCH_X86)
8363# define IPRT_ASM_WATCOM_X86_32_WITH_PRAGMAS
8364# undef IPRT_INCLUDED_asm_watcom_x86_32_h
8365# include "asm-watcom-x86-32.h"
8366#endif
8367
8368#endif /* !IPRT_INCLUDED_asm_h */
8369
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette