VirtualBox

source: vbox/trunk/include/iprt/asm.h

Last change on this file was 106061, checked in by vboxsync, 8 weeks ago

Copyright year updates by scm.

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 288.1 KB
Line 
1/** @file
2 * IPRT - Assembly Functions.
3 */
4
5/*
6 * Copyright (C) 2006-2024 Oracle and/or its affiliates.
7 *
8 * This file is part of VirtualBox base platform packages, as
9 * available from https://www.virtualbox.org.
10 *
11 * This program is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU General Public License
13 * as published by the Free Software Foundation, in version 3 of the
14 * License.
15 *
16 * This program is distributed in the hope that it will be useful, but
17 * WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, see <https://www.gnu.org/licenses>.
23 *
24 * The contents of this file may alternatively be used under the terms
25 * of the Common Development and Distribution License Version 1.0
26 * (CDDL), a copy of it is provided in the "COPYING.CDDL" file included
27 * in the VirtualBox distribution, in which case the provisions of the
28 * CDDL are applicable instead of those of the GPL.
29 *
30 * You may elect to license modified versions of this file under the
31 * terms and conditions of either the GPL or the CDDL or both.
32 *
33 * SPDX-License-Identifier: GPL-3.0-only OR CDDL-1.0
34 */
35
36#ifndef IPRT_INCLUDED_asm_h
37#define IPRT_INCLUDED_asm_h
38#ifndef RT_WITHOUT_PRAGMA_ONCE
39# pragma once
40#endif
41
42#include <iprt/cdefs.h>
43#include <iprt/types.h>
44#include <iprt/assert.h>
45/** @def RT_INLINE_ASM_USES_INTRIN
46 * Defined as 1 if we're using a _MSC_VER 1400.
47 * Otherwise defined as 0.
48 */
49
50/* Solaris 10 header ugliness */
51#ifdef u
52# undef u
53#endif
54
55#if defined(_MSC_VER) && RT_INLINE_ASM_USES_INTRIN
56/* Emit the intrinsics at all optimization levels. */
57# include <iprt/sanitized/intrin.h>
58# pragma intrinsic(_ReadWriteBarrier)
59# pragma intrinsic(__cpuid)
60# pragma intrinsic(__stosd)
61# pragma intrinsic(__stosw)
62# pragma intrinsic(__stosb)
63# pragma intrinsic(_BitScanForward)
64# pragma intrinsic(_BitScanReverse)
65# pragma intrinsic(_bittest)
66# pragma intrinsic(_bittestandset)
67# pragma intrinsic(_bittestandreset)
68# pragma intrinsic(_bittestandcomplement)
69# pragma intrinsic(_byteswap_ushort)
70# pragma intrinsic(_byteswap_ulong)
71# pragma intrinsic(_interlockedbittestandset)
72# pragma intrinsic(_interlockedbittestandreset)
73# pragma intrinsic(_InterlockedAnd)
74# pragma intrinsic(_InterlockedOr)
75# pragma intrinsic(_InterlockedXor)
76# pragma intrinsic(_InterlockedIncrement)
77# pragma intrinsic(_InterlockedDecrement)
78# pragma intrinsic(_InterlockedExchange)
79# pragma intrinsic(_InterlockedExchangeAdd)
80# pragma intrinsic(_InterlockedCompareExchange)
81# pragma intrinsic(_InterlockedCompareExchange8)
82# pragma intrinsic(_InterlockedCompareExchange16)
83# pragma intrinsic(_InterlockedCompareExchange64)
84# pragma intrinsic(_rotl)
85# pragma intrinsic(_rotr)
86# pragma intrinsic(_rotl64)
87# pragma intrinsic(_rotr64)
88# ifdef RT_ARCH_AMD64
89# pragma intrinsic(__stosq)
90# pragma intrinsic(_byteswap_uint64)
91# pragma intrinsic(_InterlockedCompareExchange128)
92# pragma intrinsic(_InterlockedExchange64)
93# pragma intrinsic(_InterlockedExchangeAdd64)
94# pragma intrinsic(_InterlockedAnd64)
95# pragma intrinsic(_InterlockedOr64)
96# pragma intrinsic(_InterlockedIncrement64)
97# pragma intrinsic(_InterlockedDecrement64)
98# endif
99#endif
100
101#if (defined(RT_ARCH_ARM64) && defined(RT_OS_DARWIN)) || defined(DOXYGEN_RUNNING)
102/** @def RTASM_ARM64_USE_FEAT_LSE
103 * Use instructions from the FEAT_LSE set to implement atomic operations,
104 * assuming that the host CPU always supports these.
 *
 * Only defined for ARM64 Darwin builds (and when doxygen is running). */
105# define RTASM_ARM64_USE_FEAT_LSE 1
106/** @def RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB
107 * Set to use the LSE instructions w/o a DMB barrier in most places and rely on
108 * the acquire-release aspects to do the serializing. The assumption is that the
109 * tstRTInline benchmark may be skewing the results testing an unusual scenario.
 *
 * When defined, the exchange functions in this file emit the SWPAL* forms
 * without a preceding DMB; otherwise they emit DMB SY followed by plain SWP*. */
110# define RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB 1
111#endif
112
113
114/*
115 * Undefine all symbols we have Watcom C/C++ #pragma aux'es for.
116 */
117#if defined(__WATCOMC__) && ARCH_BITS == 16 && defined(RT_ARCH_X86)
118# include "asm-watcom-x86-16.h"
119#elif defined(__WATCOMC__) && ARCH_BITS == 32 && defined(RT_ARCH_X86)
120# include "asm-watcom-x86-32.h"
121#endif
122
123
124/** @defgroup grp_rt_asm ASM - Assembly Routines
125 * @ingroup grp_rt
126 *
127 * @remarks The difference between ordered and unordered atomic operations is
128 * that the former will complete outstanding reads and writes before
129 * continuing while the latter doesn't make any promises about the
130 * order. Ordered operations don't, it seems, make any 100% promise
131 * wrt whether the operation will complete before any subsequent
132 * memory access. (please, correct if wrong.)
133 *
134 * ASMAtomicSomething operations are all ordered, while
135 * ASMAtomicUoSomething are unordered (note the Uo).
136 *
137 * Please note that ordered operations do not necessarily imply a
138 * compiler (memory) barrier. The user has to use the
139 * ASMCompilerBarrier() macro when that is deemed necessary.
140 *
141 * @remarks Some remarks about __volatile__: Without this keyword gcc is allowed
142 * to reorder or even optimize assembler instructions away. For
143 * instance, in the following code the second rdmsr instruction is
144 * optimized away because gcc treats that instruction as deterministic:
145 *
146 * @code
147 * static inline uint32_t rdmsr_low(int idx)
148 * {
149 * uint32_t low;
150 * __asm__ ("rdmsr" : "=a"(low) : "c"(idx) : "edx");
 * return low;
151 * }
152 * ...
153 * uint32_t msr1 = rdmsr_low(1);
154 * foo(msr1);
155 * msr1 = rdmsr_low(1);
156 * bar(msr1);
157 * @endcode
158 *
159 * The input parameter of rdmsr_low is the same for both calls and
160 * therefore gcc will use the result of the first call as input
161 * parameter for bar() as well. For rdmsr this is not acceptable as
162 * this instruction is _not_ deterministic. This applies to reading
163 * machine status information in general.
164 *
165 * @{
166 */
167
168
169/** @def RT_INLINE_ASM_GCC_4_3_X_X86
170 * Used to work around some 4.3.x register allocation issues in this version of
171 * the compiler. So far this workaround is still required for 4.4 and 4.5 but
172 * definitely not for 5.x.
 *
 * Defined as 1 when RT_GNUC_PREREQ(4, 3) holds, RT_GNUC_PREREQ(5, 0) does not,
 * and __i386__ is defined; otherwise defined as 0. */
173#if (RT_GNUC_PREREQ(4, 3) && !RT_GNUC_PREREQ(5, 0) && defined(__i386__))
174# define RT_INLINE_ASM_GCC_4_3_X_X86 1
175#else
176# define RT_INLINE_ASM_GCC_4_3_X_X86 0
177#endif
178
179/** @def RT_INLINE_DONT_MIX_CMPXCHG8B_AND_PIC
180 * i686-apple-darwin9-gcc-4.0.1 (GCC) 4.0.1 (Apple Inc. build 5493) screws up
181 * RTSemRWRequestWrite semsemrw-lockless-generic.cpp in release builds. PIC
182 * mode, x86.
183 *
184 * Some gcc 4.3.x versions may have register allocation issues with cmpxchg8b
185 * when in PIC mode on x86.
 *
 * Unless already defined by the includer, this is set to:
 *  - 1 when doxygen is running or the compiler is Watcom,
 *  - 0 for Visual C++,
 *  - 1 for PIC builds on RT_ARCH_X86 when either RT_INLINE_ASM_GCC_4_3_X_X86
 *    is non-zero or the target is Darwin,
 *  - 0 in all other cases.
 *
 * When non-zero, ASMAtomicXchgU64 is only declared as a prototype in this
 * header instead of being defined inline.
186 */
187#ifndef RT_INLINE_DONT_MIX_CMPXCHG8B_AND_PIC
188# if defined(DOXYGEN_RUNNING) || defined(__WATCOMC__) /* Watcom has trouble with the expression below */
189# define RT_INLINE_DONT_MIX_CMPXCHG8B_AND_PIC 1
190# elif defined(_MSC_VER) /* Visual C++ has trouble too, but it'll only tell us when C4688 is enabled. */
191# define RT_INLINE_DONT_MIX_CMPXCHG8B_AND_PIC 0
192# elif ( (defined(PIC) || defined(__PIC__)) \
193 && defined(RT_ARCH_X86) \
194 && ( RT_INLINE_ASM_GCC_4_3_X_X86 \
195 || defined(RT_OS_DARWIN)) )
196# define RT_INLINE_DONT_MIX_CMPXCHG8B_AND_PIC 1
197# else
198# define RT_INLINE_DONT_MIX_CMPXCHG8B_AND_PIC 0
199# endif
200#endif
201
202
203/*
204 * ARM is great fun.
 *
 * This section defines the barrier instruction strings (RTASM_ARM_<type>),
 * their matching inline assembly input operands (RTASM_ARM_<type>_IN_REG and
 * RTASM_ARM_<type>_COMMA_IN_REG) and the load-modify-store helper macros used
 * by the ARM implementations of the atomic functions.
205 */
206#if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
207
/* "No barrier" expands to nothing, so it can be used wherever a barrier string is expected. */
208# define RTASM_ARM_NO_BARRIER
209# ifdef RT_ARCH_ARM64
/* On ARM64 the barrier instructions take no register operand, so all the
   *_IN_REG and *_COMMA_IN_REG helpers are defined empty. */
210# define RTASM_ARM_NO_BARRIER_IN_REG
211# define RTASM_ARM_NO_BARRIER_COMMA_IN_REG
212# define RTASM_ARM_DSB_SY "dsb sy\n\t"
213# define RTASM_ARM_DSB_SY_IN_REG
214# define RTASM_ARM_DSB_SY_COMMA_IN_REG
215# define RTASM_ARM_DMB_SY "dmb sy\n\t"
216# define RTASM_ARM_DMB_SY_IN_REG
217# define RTASM_ARM_DMB_SY_COMMA_IN_REG
218# define RTASM_ARM_DMB_ST "dmb st\n\t"
219# define RTASM_ARM_DMB_ST_IN_REG
220# define RTASM_ARM_DMB_ST_COMMA_IN_REG
221# define RTASM_ARM_DMB_LD "dmb ld\n\t"
222# define RTASM_ARM_DMB_LD_IN_REG
223# define RTASM_ARM_DMB_LD_COMMA_IN_REG
/* Selects the first (64-bit) expression on ARM64. */
224# define RTASM_ARM_PICK_6432(expr64, expr32) expr64
/** ARM64: 32-bit load-modify-store retry loop yielding the new value.
 *
 * Declares the locals rcSpill and u32NewRet in the invoking scope, then loops
 * LDAXR -> modify64 -> STLXR until the exclusive store succeeds. The modified
 * value is left in u32NewRet.
 *
 * @param name          Text used to build the retry label.
 * @param a_pu32Mem     Pointer to the 32-bit memory operand.
 * @param barrier_type  Suffix pasted onto RTASM_ARM_ to select the barrier
 *                      string emitted at the top of the loop.
 * @param modify64      Assembly string modifying %w[uNew] (used here).
 * @param modify32      ARM32 assembly string (unused in this variant).
 * @param in_reg        Input operand(s) referenced by the modify string.
 */
225# define RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_32(name, a_pu32Mem, barrier_type, modify64, modify32, in_reg) \
226 uint32_t rcSpill; \
227 uint32_t u32NewRet; \
228 __asm__ __volatile__("Ltry_again_" #name "_%=:\n\t" \
229 RTASM_ARM_##barrier_type /* before label? */ \
230 "ldaxr %w[uNew], %[pMem]\n\t" \
231 modify64 \
232 "stlxr %w[rc], %w[uNew], %[pMem]\n\t" \
233 "cbnz %w[rc], Ltry_again_" #name "_%=\n\t" \
234 : [pMem] "+Q" (*a_pu32Mem) \
235 , [uNew] "=&r" (u32NewRet) \
236 , [rc] "=&r" (rcSpill) \
237 : in_reg \
238 : "cc")
/** ARM64: 32-bit load-modify-store retry loop yielding the old value.
 *
 * Declares the locals rcSpill, u32OldRet and u32NewSpill in the invoking
 * scope, then loops LDAXR -> modify64 -> STLXR until the exclusive store
 * succeeds. The loaded value is left in u32OldRet; modify64 is expected to
 * produce the value to store in %w[uNew] (u32NewSpill).
 *
 * @param name          Text used to build the retry label.
 * @param a_pu32Mem     Pointer to the 32-bit memory operand.
 * @param barrier_type  Suffix pasted onto RTASM_ARM_ to select the barrier
 *                      string emitted at the top of the loop.
 * @param modify64      Assembly string computing %w[uNew] (used here).
 * @param modify32      ARM32 assembly string (unused in this variant).
 * @param in_reg        Input operand(s) referenced by the modify string.
 */
239# define RTASM_ARM_LOAD_MODIFY_STORE_RET_OLD_32(name, a_pu32Mem, barrier_type, modify64, modify32, in_reg) \
240 uint32_t rcSpill; \
241 uint32_t u32OldRet; \
242 uint32_t u32NewSpill; \
243 __asm__ __volatile__("Ltry_again_" #name "_%=:\n\t" \
244 RTASM_ARM_##barrier_type /* before label? */ \
245 "ldaxr %w[uOld], %[pMem]\n\t" \
246 modify64 \
247 "stlxr %w[rc], %w[uNew], %[pMem]\n\t" \
248 "cbnz %w[rc], Ltry_again_" #name "_%=\n\t" \
249 : [pMem] "+Q" (*a_pu32Mem) \
250 , [uOld] "=&r" (u32OldRet) \
251 , [uNew] "=&r" (u32NewSpill) \
252 , [rc] "=&r" (rcSpill) \
253 : in_reg \
254 : "cc")
/** ARM64: 64-bit load-modify-store retry loop yielding the new value.
 *
 * Declares the locals rcSpill and u64NewRet in the invoking scope, then loops
 * LDAXR -> modify64 -> STLXR until the exclusive store succeeds. The modified
 * value is left in u64NewRet.
 *
 * @param name          Text used to build the retry label.
 * @param a_pu64Mem     Pointer to the 64-bit memory operand.
 * @param barrier_type  Suffix pasted onto RTASM_ARM_ to select the barrier
 *                      string emitted at the top of the loop.
 * @param modify64      Assembly string modifying %[uNew] (used here).
 * @param modify32      ARM32 assembly string (unused in this variant).
 * @param in_reg        Input operand(s) referenced by the modify string.
 */
255# define RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_64(name, a_pu64Mem, barrier_type, modify64, modify32, in_reg) \
256 uint32_t rcSpill; \
257 uint64_t u64NewRet; \
258 __asm__ __volatile__("Ltry_again_" #name "_%=:\n\t" \
259 RTASM_ARM_##barrier_type /* before label? */ \
260 "ldaxr %[uNew], %[pMem]\n\t" \
261 modify64 \
262 "stlxr %w[rc], %[uNew], %[pMem]\n\t" \
263 "cbnz %w[rc], Ltry_again_" #name "_%=\n\t" \
264 : [pMem] "+Q" (*a_pu64Mem) \
265 , [uNew] "=&r" (u64NewRet) \
266 , [rc] "=&r" (rcSpill) \
267 : in_reg \
268 : "cc")
/** ARM64: 64-bit load-modify-store retry loop yielding the old value.
 *
 * Declares the locals rcSpill, u64OldRet and u64NewSpill in the invoking
 * scope, then loops LDAXR -> modify64 -> STLXR until the exclusive store
 * succeeds. The loaded value is left in u64OldRet; modify64 is expected to
 * produce the value to store in %[uNew] (u64NewSpill).
 *
 * @param name          Text used to build the retry label.
 * @param a_pu64Mem     Pointer to the 64-bit memory operand.
 * @param barrier_type  Suffix pasted onto RTASM_ARM_ to select the barrier
 *                      string emitted at the top of the loop.
 * @param modify64      Assembly string computing %[uNew] (used here).
 * @param modify32      ARM32 assembly string (unused in this variant).
 * @param in_reg        Input operand(s) referenced by the modify string.
 */
269# define RTASM_ARM_LOAD_MODIFY_STORE_RET_OLD_64(name, a_pu64Mem, barrier_type, modify64, modify32, in_reg) \
270 uint32_t rcSpill; \
271 uint64_t u64OldRet; \
272 uint64_t u64NewSpill; \
273 __asm__ __volatile__("Ltry_again_" #name "_%=:\n\t" \
274 RTASM_ARM_##barrier_type /* before label? */ \
275 "ldaxr %[uOld], %[pMem]\n\t" \
276 modify64 \
277 "stlxr %w[rc], %[uNew], %[pMem]\n\t" \
278 "cbnz %w[rc], Ltry_again_" #name "_%=\n\t" \
279 : [pMem] "+Q" (*a_pu64Mem) \
280 , [uOld] "=&r" (u64OldRet) \
281 , [uNew] "=&r" (u64NewSpill) \
282 , [rc] "=&r" (rcSpill) \
283 : in_reg \
284 : "cc")
285
286# else /* RT_ARCH_ARM32 */
/* Selects the second (32-bit) expression on ARM32. */
287# define RTASM_ARM_PICK_6432(expr64, expr32) expr32
/* ARMv7 and later: dedicated DSB/DMB instructions. The *_IN_REG operands are
   dummies ("X" constraint, constant 0xfade) - presumably only there so the
   input operand list has an entry in front of the ", in_reg" used by the
   load-modify-store macros; TODO confirm. */
288# if RT_ARCH_ARM32 >= 7
/* NOTE(review): this #warning fires on every compilation taking this branch;
   looks like a porting/debugging aid - confirm whether it should stay. */
289# warning armv7
290# define RTASM_ARM_NO_BARRIER_IN_REG
291# define RTASM_ARM_NO_BARRIER_COMMA_IN_REG
292# define RTASM_ARM_DSB_SY "dsb sy\n\t"
293# define RTASM_ARM_DSB_SY_IN_REG "X" (0xfade)
294# define RTASM_ARM_DMB_SY "dmb sy\n\t"
295# define RTASM_ARM_DMB_SY_IN_REG "X" (0xfade)
296# define RTASM_ARM_DMB_ST "dmb st\n\t"
297# define RTASM_ARM_DMB_ST_IN_REG "X" (0xfade)
298# define RTASM_ARM_DMB_LD "dmb ld\n\t"
299# define RTASM_ARM_DMB_LD_IN_REG "X" (0xfade)
300
/* ARMv6: barriers are issued as MCR p15 writes taking a zero register operand
   (%[uZero]); the store/load DMB flavours all map to the full DMB.
   NOTE(review): RTASM_ARM_NO_BARRIER_IN_REG / _COMMA_IN_REG are only defined
   in the ARMv7 branch above, not in this or the next branch. */
301# elif RT_ARCH_ARM32 >= 6
302# warning armv6
303# define RTASM_ARM_DSB_SY "mcr p15, 0, %[uZero], c7, c10, 4\n\t"
304# define RTASM_ARM_DSB_SY_IN_REG [uZero] "r" (0)
305# define RTASM_ARM_DMB_SY "mcr p15, 0, %[uZero], c7, c10, 5\n\t"
306# define RTASM_ARM_DMB_SY_IN_REG [uZero] "r" (0)
307# define RTASM_ARM_DMB_ST RTASM_ARM_DMB_SY
308# define RTASM_ARM_DMB_ST_IN_REG RTASM_ARM_DMB_SY_IN_REG
309# define RTASM_ARM_DMB_LD RTASM_ARM_DMB_SY
310# define RTASM_ARM_DMB_LD_IN_REG RTASM_ARM_DMB_SY_IN_REG
311
/* ARMv4/ARMv5: only the DSB style MCR is defined; every DMB flavour maps to it. */
312# elif RT_ARCH_ARM32 >= 4
313# warning armv5 or older
314# define RTASM_ARM_DSB_SY "mcr p15, 0, %[uZero], c7, c10, 4\n\t"
315# define RTASM_ARM_DSB_SY_IN_REG [uZero] "r" (0)
316# define RTASM_ARM_DMB_SY RTASM_ARM_DSB_SY
317# define RTASM_ARM_DMB_SY_IN_REG RTASM_ARM_DSB_SY_IN_REG
318# define RTASM_ARM_DMB_ST RTASM_ARM_DSB_SY
319# define RTASM_ARM_DMB_ST_IN_REG RTASM_ARM_DSB_SY_IN_REG
320# define RTASM_ARM_DMB_LD RTASM_ARM_DSB_SY
321# define RTASM_ARM_DMB_LD_IN_REG RTASM_ARM_DSB_SY_IN_REG
322# else
323# error "huh? Odd RT_ARCH_ARM32 value!"
324# endif
/* The *_COMMA_IN_REG forms prefix the operand with a comma so it can be
   appended after another input operand. */
325# define RTASM_ARM_DSB_SY_COMMA_IN_REG , RTASM_ARM_DSB_SY_IN_REG
326# define RTASM_ARM_DMB_SY_COMMA_IN_REG , RTASM_ARM_DMB_SY_IN_REG
327# define RTASM_ARM_DMB_ST_COMMA_IN_REG , RTASM_ARM_DMB_ST_IN_REG
328# define RTASM_ARM_DMB_LD_COMMA_IN_REG , RTASM_ARM_DMB_LD_IN_REG
/** ARM32: 32-bit load-modify-store retry loop yielding the new value.
 *
 * Declares the locals rcSpill and u32NewRet in the invoking scope, then loops
 * LDREX -> modify32 -> STREX until STREX reports success (rc == 0). The
 * modified value is left in u32NewRet. The barrier's *_IN_REG operand is
 * passed as the first input operand, followed by in_reg.
 *
 * @param name          Text used to build the retry label.
 * @param a_pu32Mem     Pointer to the 32-bit memory operand.
 * @param barrier_type  Suffix concatenated onto RTASM_ARM_ to select the
 *                      barrier string and its input operand.
 * @param modify64      ARM64 assembly string (unused in this variant).
 * @param modify32      Assembly string modifying %[uNew] (used here).
 * @param in_reg        Input operand(s) referenced by the modify string.
 */
329# define RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_32(name, a_pu32Mem, barrier_type, modify64, modify32, in_reg) \
330 uint32_t rcSpill; \
331 uint32_t u32NewRet; \
332 __asm__ __volatile__("Ltry_again_" #name "_%=:\n\t" \
333 RT_CONCAT(RTASM_ARM_,barrier_type) /* before label? */ \
334 "ldrex %[uNew], %[pMem]\n\t" \
335 modify32 \
336 "strex %[rc], %[uNew], %[pMem]\n\t" \
337 "cmp %[rc], #0\n\t" \
338 "bne Ltry_again_" #name "_%=\n\t" \
339 : [pMem] "+m" (*a_pu32Mem) \
340 , [uNew] "=&r" (u32NewRet) \
341 , [rc] "=&r" (rcSpill) \
342 : RT_CONCAT3(RTASM_ARM_,barrier_type,_IN_REG) \
343 , in_reg \
344 : "cc")
/** ARM32: 32-bit load-modify-store retry loop yielding the old value.
 *
 * Declares the locals rcSpill, u32OldRet and u32NewSpill in the invoking
 * scope, then loops LDREX -> modify32 -> STREX until STREX reports success
 * (rc == 0). The loaded value is left in u32OldRet; modify32 is expected to
 * produce the value to store in %[uNew] (u32NewSpill).
 *
 * @param name          Text used to build the retry label.
 * @param a_pu32Mem     Pointer to the 32-bit memory operand.
 * @param barrier_type  Suffix concatenated onto RTASM_ARM_ to select the
 *                      barrier string and its input operand.
 * @param modify64      ARM64 assembly string (unused in this variant).
 * @param modify32      Assembly string computing %[uNew] (used here).
 * @param in_reg        Input operand(s) referenced by the modify string.
 */
345# define RTASM_ARM_LOAD_MODIFY_STORE_RET_OLD_32(name, a_pu32Mem, barrier_type, modify64, modify32, in_reg) \
346 uint32_t rcSpill; \
347 uint32_t u32OldRet; \
348 uint32_t u32NewSpill; \
349 __asm__ __volatile__("Ltry_again_" #name "_%=:\n\t" \
350 RT_CONCAT(RTASM_ARM_,barrier_type) /* before label? */ \
351 "ldrex %[uOld], %[pMem]\n\t" \
352 modify32 \
353 "strex %[rc], %[uNew], %[pMem]\n\t" \
354 "cmp %[rc], #0\n\t" \
355 "bne Ltry_again_" #name "_%=\n\t" \
356 : [pMem] "+m" (*a_pu32Mem) \
357 , [uOld] "=&r" (u32OldRet) \
358 , [uNew] "=&r" (u32NewSpill) \
359 , [rc] "=&r" (rcSpill) \
360 : RT_CONCAT3(RTASM_ARM_,barrier_type,_IN_REG) \
361 , in_reg \
362 : "cc")
/** ARM32: 64-bit load-modify-store retry loop yielding the new value.
 *
 * Declares the locals rcSpill and u64NewRet in the invoking scope, then loops
 * LDREXD -> modify32 -> STREXD until STREXD reports success (rc == 0). The
 * 64-bit value lives in a register pair (%[uNew] and %H[uNew]) and is left in
 * u64NewRet.
 *
 * @param name          Text used to build the retry label.
 * @param a_pu64Mem     Pointer to the 64-bit memory operand.
 * @param barrier_type  Suffix concatenated onto RTASM_ARM_ to select the
 *                      barrier string and its input operand.
 * @param modify64      ARM64 assembly string (unused in this variant).
 * @param modify32      Assembly string modifying the uNew pair (used here).
 * @param in_reg        Input operand(s) referenced by the modify string.
 */
363# define RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_64(name, a_pu64Mem, barrier_type, modify64, modify32, in_reg) \
364 uint32_t rcSpill; \
365 uint64_t u64NewRet; \
366 __asm__ __volatile__("Ltry_again_" #name "_%=:\n\t" \
367 RT_CONCAT(RTASM_ARM_,barrier_type) /* before label? */ \
368 "ldrexd %[uNew], %H[uNew], %[pMem]\n\t" \
369 modify32 \
370 "strexd %[rc], %[uNew], %H[uNew], %[pMem]\n\t" \
371 "cmp %[rc], #0\n\t" \
372 "bne Ltry_again_" #name "_%=\n\t" \
373 : [pMem] "+m" (*a_pu64Mem), \
374 [uNew] "=&r" (u64NewRet), \
375 [rc] "=&r" (rcSpill) \
376 : RT_CONCAT3(RTASM_ARM_,barrier_type,_IN_REG) \
377 , in_reg \
378 : "cc")
/** ARM32: 64-bit load-modify-store retry loop yielding the old value.
 *
 * Declares the locals rcSpill, u64OldRet and u64NewSpill in the invoking
 * scope, then loops LDREXD -> modify32 -> STREXD until STREXD reports success
 * (rc == 0). The loaded register pair is left in u64OldRet; modify32 is
 * expected to produce the pair to store in %[uNew]/%H[uNew] (u64NewSpill).
 *
 * @param name          Text used to build the retry label.
 * @param a_pu64Mem     Pointer to the 64-bit memory operand.
 * @param barrier_type  Suffix concatenated onto RTASM_ARM_ to select the
 *                      barrier string and its input operand.
 * @param modify64      ARM64 assembly string (unused in this variant).
 * @param modify32      Assembly string computing the uNew pair (used here).
 * @param in_reg        Input operand(s) referenced by the modify string.
 */
379# define RTASM_ARM_LOAD_MODIFY_STORE_RET_OLD_64(name, a_pu64Mem, barrier_type, modify64, modify32, in_reg) \
380 uint32_t rcSpill; \
381 uint64_t u64OldRet; \
382 uint64_t u64NewSpill; \
383 __asm__ __volatile__("Ltry_again_" #name "_%=:\n\t" \
384 RT_CONCAT(RTASM_ARM_,barrier_type) /* before label? */ \
385 "ldrexd %[uOld], %H[uOld], %[pMem]\n\t" \
386 modify32 \
387 "strexd %[rc], %[uNew], %H[uNew], %[pMem]\n\t" \
388 "cmp %[rc], #0\n\t" \
389 "bne Ltry_again_" #name "_%=\n\t" \
390 : [pMem] "+m" (*a_pu64Mem), \
391 [uOld] "=&r" (u64OldRet), \
392 [uNew] "=&r" (u64NewSpill), \
393 [rc] "=&r" (rcSpill) \
394 : RT_CONCAT3(RTASM_ARM_,barrier_type,_IN_REG) \
395 , in_reg \
396 : "cc")
397# endif /* RT_ARCH_ARM32 */
398#endif
399
400
401/** @def ASMReturnAddress
402 * Gets the return address of the current (or calling if you like) function or method.
 *
 * Maps to the _ReturnAddress() intrinsic for Visual C++ and to
 * __builtin_return_address(0) for GCC compatible compilers (and doxygen).
 * Any compiler other than these and Watcom is rejected with an \#error.
403 */
404#ifdef _MSC_VER
405# ifdef __cplusplus
406extern "C"
407# endif
408void * _ReturnAddress(void);
409# pragma intrinsic(_ReturnAddress)
410# define ASMReturnAddress() _ReturnAddress()
411#elif defined(__GNUC__) || defined(DOXYGEN_RUNNING)
412# define ASMReturnAddress() __builtin_return_address(0)
413#elif defined(__WATCOMC__)
/* No Watcom implementation: the expansion calls a function that is not
   declared in this file, so any use is presumably meant to fail the build. */
414# define ASMReturnAddress() Watcom_does_not_appear_to_have_intrinsic_return_address_function()
415#else
416# error "Unsupported compiler."
417#endif
418
419
420/**
421 * Compiler memory barrier.
422 *
423 * Ensure that the compiler does not use any cached (register/tmp stack) memory
424 * values or any outstanding writes when returning from this function.
425 *
426 * This function must be used if non-volatile data is modified by a
427 * device or the VMM. Typical cases are port access, MMIO access,
428 * trapping instruction, etc.
 *
 * Implementations: an empty asm statement with a "memory" clobber for GNU
 * style inline assembly, _ReadWriteBarrier() when intrinsics are used, an
 * external function for Watcom, and an empty __asm block otherwise.
429 */
430#if RT_INLINE_ASM_GNU_STYLE
431# define ASMCompilerBarrier() do { __asm__ __volatile__("" : : : "memory"); } while (0)
432#elif RT_INLINE_ASM_USES_INTRIN
433# define ASMCompilerBarrier() do { _ReadWriteBarrier(); } while (0)
434#elif defined(__WATCOMC__)
/* Only declared here; the definition is not part of this header section. */
435void ASMCompilerBarrier(void);
436#else /* 2003 should have _ReadWriteBarrier() but I guess we're at 2002 level then... */
437DECLINLINE(void) ASMCompilerBarrier(void) RT_NOTHROW_DEF
438{
439 __asm
440 {
441 }
442}
443#endif
444
445
446/** @def ASMBreakpoint
447 * Debugger Breakpoint.
 *
 * Plain alias that expands to RT_BREAKPOINT().
448 * @deprecated Use RT_BREAKPOINT instead.
449 * @internal
450 */
451#define ASMBreakpoint() RT_BREAKPOINT()
452
453
454/**
455 * Spinloop hint for platforms that have these, empty function on the other
456 * platforms.
457 *
458 * x86 & AMD64: The PAUSE variant of NOP for helping hyperthreaded CPUs detecting
459 * spin locks.
 *
 * ARM32 & ARM64: The YIELD instruction.
 *
 * On x86/AMD64 builds with RT_INLINE_ASM_EXTERNAL_TMP_ARM set, only a
 * prototype is declared here.
460 */
461#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && (defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86))
462RT_ASM_DECL_PRAGMA_WATCOM(void) ASMNopPause(void) RT_NOTHROW_PROTO;
463#else
464DECLINLINE(void) ASMNopPause(void) RT_NOTHROW_DEF
465{
466# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
/* The instruction is emitted as the raw bytes F3 90 in both assembler dialects. */
467# if RT_INLINE_ASM_GNU_STYLE
468 __asm__ __volatile__(".byte 0xf3,0x90\n\t");
469# else
470 __asm {
471 _emit 0f3h
472 _emit 090h
473 }
474# endif
475
476# elif defined(RT_ARCH_ARM32) || defined(RT_ARCH_ARM64)
477 __asm__ __volatile__("yield\n\t"); /* ARMv6K+ */
478
479# else
480 /* dummy */
481# endif
482}
483#endif
484
485
486/**
487 * Atomically Exchange an unsigned 8-bit value, ordered.
488 *
 * x86/AMD64 use XCHG. ARM uses either an LSE byte swap instruction (when
 * RTASM_ARM64_USE_FEAT_LSE is defined) or an exclusive load/store retry loop.
 * Other architectures hit the "Port me" error. When
 * RT_INLINE_ASM_EXTERNAL_TMP_ARM is non-zero only a prototype is declared.
 *
489 * @returns Current *pu8 value (i.e. the value it held before the exchange).
490 * @param pu8 Pointer to the 8-bit variable to update.
491 * @param u8 The 8-bit value to assign to *pu8.
492 */
493#if RT_INLINE_ASM_EXTERNAL_TMP_ARM
494RT_ASM_DECL_PRAGMA_WATCOM(uint8_t) ASMAtomicXchgU8(volatile uint8_t RT_FAR *pu8, uint8_t u8) RT_NOTHROW_PROTO;
495#else
496DECLINLINE(uint8_t) ASMAtomicXchgU8(volatile uint8_t RT_FAR *pu8, uint8_t u8) RT_NOTHROW_DEF
497{
498# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
/* XCHG swaps u8 with *pu8, so u8 holds the previous memory value afterwards. */
499# if RT_INLINE_ASM_GNU_STYLE
500 __asm__ __volatile__("xchgb %0, %1\n\t"
501 : "=m" (*pu8)
502 , "=q" (u8) /* =r - busted on g++ (GCC) 3.4.4 20050721 (Red Hat 3.4.4-2) */
503 : "1" (u8)
504 , "m" (*pu8));
505# else
506 __asm
507 {
508# ifdef RT_ARCH_AMD64
509 mov rdx, [pu8]
510 mov al, [u8]
511 xchg [rdx], al
512 mov [u8], al
513# else
514 mov edx, [pu8]
515 mov al, [u8]
516 xchg [edx], al
517 mov [u8], al
518# endif
519 }
520# endif
521 return u8;
522
523# elif defined(RT_ARCH_ARM32) || defined(RT_ARCH_ARM64)
/* The old value is collected in a 32-bit register and truncated on return. */
524 uint32_t uOld;
525# if defined(RTASM_ARM64_USE_FEAT_LSE)
526 /* SWPALB is ~40% more expensive than the non-LSE variant (M1), but since we
527 have the barrier we shouldn't need that, right? Ordering should be taken
528 care of by the DMB. The SWPB is rather cheap (~70% faster). */
529 __asm__ __volatile__("Lstart_ASMAtomicXchgU8_%=:\n\t"
530# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
531 "swpalb %w[uNew], %w[uOld], %[pMem]\n\t"
532# else
533 RTASM_ARM_DMB_SY
534 "swpb %w[uNew], %w[uOld], %[pMem]\n\t"
535# endif
536 : [pMem] "+Q" (*pu8)
537 , [uOld] "=&r" (uOld)
538 : [uNew] "r" ((uint32_t)u8)
539 : );
540# else
/* Exclusive load/store pair, retried until the exclusive store succeeds (rc == 0). */
541 uint32_t rcSpill;
542 __asm__ __volatile__("Ltry_again_ASMAtomicXchgU8_%=:\n\t"
543 RTASM_ARM_DMB_SY
544# if defined(RT_ARCH_ARM64)
545 "ldaxrb %w[uOld], %[pMem]\n\t"
546 "stlxrb %w[rc], %w[uNew], %[pMem]\n\t"
547 "cbnz %w[rc], Ltry_again_ASMAtomicXchgU8_%=\n\t"
548# else
549 "ldrexb %[uOld], %[pMem]\n\t" /* ARMv6+ */
550 "strexb %[rc], %[uNew], %[pMem]\n\t"
551 "cmp %[rc], #0\n\t"
552 "bne Ltry_again_ASMAtomicXchgU8_%=\n\t"
553# endif
554 : [pMem] "+Q" (*pu8)
555 , [uOld] "=&r" (uOld)
556 , [rc] "=&r" (rcSpill)
557 : [uNew] "r" ((uint32_t)u8)
558 RTASM_ARM_DMB_SY_COMMA_IN_REG
559 : "cc");
560# endif
561 return (uint8_t)uOld;
562
563# else
564# error "Port me"
565# endif
566}
567#endif
568
569
570/**
571 * Atomically Exchange a signed 8-bit value, ordered.
572 *
573 * @returns Current *pu8 value
574 * @param pi8 Pointer to the 8-bit variable to update.
575 * @param i8 The 8-bit value to assign to *pi8.
576 */
577DECLINLINE(int8_t) ASMAtomicXchgS8(volatile int8_t RT_FAR *pi8, int8_t i8) RT_NOTHROW_DEF
578{
579 return (int8_t)ASMAtomicXchgU8((volatile uint8_t RT_FAR *)pi8, (uint8_t)i8);
580}
581
582
583/**
584 * Atomically Exchange a bool value, ordered.
585 *
586 * @returns Current *pf value
587 * @param pf Pointer to the 8-bit variable to update.
588 * @param f The 8-bit value to assign to *pi8.
589 */
590DECLINLINE(bool) ASMAtomicXchgBool(volatile bool RT_FAR *pf, bool f) RT_NOTHROW_DEF
591{
592#ifdef _MSC_VER
593 return !!ASMAtomicXchgU8((volatile uint8_t RT_FAR *)pf, (uint8_t)f);
594#else
595 return (bool)ASMAtomicXchgU8((volatile uint8_t RT_FAR *)pf, (uint8_t)f);
596#endif
597}
598
599
600/**
601 * Atomically Exchange an unsigned 16-bit value, ordered.
602 *
 * x86/AMD64 use XCHG. ARM uses either an LSE halfword swap instruction (when
 * RTASM_ARM64_USE_FEAT_LSE is defined) or an exclusive load/store retry loop.
 * Other architectures hit the "Port me" error. When
 * RT_INLINE_ASM_EXTERNAL_TMP_ARM is non-zero only a prototype is declared.
 *
603 * @returns Current *pu16 value (i.e. the value it held before the exchange).
604 * @param pu16 Pointer to the 16-bit variable to update.
605 * @param u16 The 16-bit value to assign to *pu16.
606 */
607#if RT_INLINE_ASM_EXTERNAL_TMP_ARM
608RT_ASM_DECL_PRAGMA_WATCOM(uint16_t) ASMAtomicXchgU16(volatile uint16_t RT_FAR *pu16, uint16_t u16) RT_NOTHROW_PROTO;
609#else
610DECLINLINE(uint16_t) ASMAtomicXchgU16(volatile uint16_t RT_FAR *pu16, uint16_t u16) RT_NOTHROW_DEF
611{
612# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
/* XCHG swaps u16 with *pu16, so u16 holds the previous memory value afterwards. */
613# if RT_INLINE_ASM_GNU_STYLE
614 __asm__ __volatile__("xchgw %0, %1\n\t"
615 : "=m" (*pu16)
616 , "=r" (u16)
617 : "1" (u16)
618 , "m" (*pu16));
619# else
620 __asm
621 {
622# ifdef RT_ARCH_AMD64
623 mov rdx, [pu16]
624 mov ax, [u16]
625 xchg [rdx], ax
626 mov [u16], ax
627# else
628 mov edx, [pu16]
629 mov ax, [u16]
630 xchg [edx], ax
631 mov [u16], ax
632# endif
633 }
634# endif
635 return u16;
636
637# elif defined(RT_ARCH_ARM32) || defined(RT_ARCH_ARM64)
/* The old value is collected in a 32-bit register and truncated on return. */
638 uint32_t uOld;
639# if defined(RTASM_ARM64_USE_FEAT_LSE)
640 /* SWPALH is ~40% more expensive than the non-LSE variant on an M1, 20%
641 slower if we remove the barrier. But since we have the barrier we
642 shouldn't need that, right? Ordering should be taken care of by the DMB.
643 The SWPH is rather cheap (~70% faster). */
644 __asm__ __volatile__("Lstart_ASMAtomicXchgU16_%=:\n\t"
645# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
646 "swpalh %w[uNew], %w[uOld], %[pMem]\n\t"
647# else
648 RTASM_ARM_DMB_SY
649 "swph %w[uNew], %w[uOld], %[pMem]\n\t"
650# endif
651 : [pMem] "+Q" (*pu16)
652 , [uOld] "=&r" (uOld)
653 : [uNew] "r" ((uint32_t)u16)
654 : );
655# else
/* Exclusive load/store pair, retried until the exclusive store succeeds (rc == 0). */
656 uint32_t rcSpill;
657 __asm__ __volatile__("Ltry_again_ASMAtomicXchgU16_%=:\n\t"
658 RTASM_ARM_DMB_SY
659# if defined(RT_ARCH_ARM64)
660 "ldaxrh %w[uOld], %[pMem]\n\t"
661 "stlxrh %w[rc], %w[uNew], %[pMem]\n\t"
662 "cbnz %w[rc], Ltry_again_ASMAtomicXchgU16_%=\n\t"
663# else
664 "ldrexh %[uOld], %[pMem]\n\t" /* ARMv6+ */
665 "strexh %[rc], %[uNew], %[pMem]\n\t"
666 "cmp %[rc], #0\n\t"
667 "bne Ltry_again_ASMAtomicXchgU16_%=\n\t"
668# endif
669 : [pMem] "+Q" (*pu16)
670 , [uOld] "=&r" (uOld)
671 , [rc] "=&r" (rcSpill)
672 : [uNew] "r" ((uint32_t)u16)
673 RTASM_ARM_DMB_SY_COMMA_IN_REG
674 : "cc");
675# endif
676 return (uint16_t)uOld;
677
678# else
679# error "Port me"
680# endif
681}
682#endif
683
684
685/**
686 * Atomically Exchange a signed 16-bit value, ordered.
687 *
688 * @returns Current *pu16 value
689 * @param pi16 Pointer to the 16-bit variable to update.
690 * @param i16 The 16-bit value to assign to *pi16.
691 */
692DECLINLINE(int16_t) ASMAtomicXchgS16(volatile int16_t RT_FAR *pi16, int16_t i16) RT_NOTHROW_DEF
693{
694 return (int16_t)ASMAtomicXchgU16((volatile uint16_t RT_FAR *)pi16, (uint16_t)i16);
695}
696
697
698/**
699 * Atomically Exchange an unsigned 32-bit value, ordered.
700 *
 * x86/AMD64 use XCHG or the _InterlockedExchange intrinsic. ARM uses either
 * the LSE swap instruction (when RTASM_ARM64_USE_FEAT_LSE is defined) or an
 * exclusive load/store retry loop. Other architectures hit the "Port me"
 * error. When RT_INLINE_ASM_EXTERNAL_TMP_ARM is non-zero and intrinsics are
 * not used, only a prototype is declared.
 *
701 * @returns Current *pu32 value (i.e. the value it held before the exchange).
702 * @param pu32 Pointer to the 32-bit variable to update.
703 * @param u32 The 32-bit value to assign to *pu32.
704 *
705 * @remarks Does not work on 286 and earlier.
706 */
707#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
708RT_ASM_DECL_PRAGMA_WATCOM(uint32_t) ASMAtomicXchgU32(volatile uint32_t RT_FAR *pu32, uint32_t u32) RT_NOTHROW_PROTO;
709#else
710DECLINLINE(uint32_t) ASMAtomicXchgU32(volatile uint32_t RT_FAR *pu32, uint32_t u32) RT_NOTHROW_DEF
711{
712# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
/* All three variants leave the previous memory value in u32. */
713# if RT_INLINE_ASM_GNU_STYLE
714 __asm__ __volatile__("xchgl %0, %1\n\t"
715 : "=m" (*pu32) /** @todo r=bird: +m rather than =m here? */
716 , "=r" (u32)
717 : "1" (u32)
718 , "m" (*pu32));
719
720# elif RT_INLINE_ASM_USES_INTRIN
721 u32 = _InterlockedExchange((long RT_FAR *)pu32, u32);
722
723# else
724 __asm
725 {
726# ifdef RT_ARCH_AMD64
727 mov rdx, [pu32]
728 mov eax, u32
729 xchg [rdx], eax
730 mov [u32], eax
731# else
732 mov edx, [pu32]
733 mov eax, u32
734 xchg [edx], eax
735 mov [u32], eax
736# endif
737 }
738# endif
739 return u32;
740
741# elif defined(RT_ARCH_ARM32) || defined(RT_ARCH_ARM64)
742 uint32_t uOld;
743# if defined(RTASM_ARM64_USE_FEAT_LSE)
744 /* SWPAL is ~40% more expensive than the non-LSE variant on an M1, 20%
745 slower if we remove the barrier. But since we have the barrier we
746 shouldn't need that, right? Ordering should be taken care of by the DMB.
747 The SWP is rather cheap (~70% faster). */
748 __asm__ __volatile__("Lstart_ASMAtomicXchgU32_%=:\n\t"
749# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
750 "swpal %w[uNew], %w[uOld], %[pMem]\n\t"
751# else
752 RTASM_ARM_DMB_SY
753 "swp %w[uNew], %w[uOld], %[pMem]\n\t"
754# endif
755 : [pMem] "+Q" (*pu32)
756 , [uOld] "=&r" (uOld)
757 : [uNew] "r" (u32)
758 : );
759# else
/* Exclusive load/store pair, retried until the exclusive store succeeds (rc == 0). */
760 uint32_t rcSpill;
761 __asm__ __volatile__("Ltry_again_ASMAtomicXchgU32_%=:\n\t"
762 RTASM_ARM_DMB_SY
763# if defined(RT_ARCH_ARM64)
764 "ldaxr %w[uOld], %[pMem]\n\t"
765 "stlxr %w[rc], %w[uNew], %[pMem]\n\t"
766 "cbnz %w[rc], Ltry_again_ASMAtomicXchgU32_%=\n\t"
767# else
768 "ldrex %[uOld], %[pMem]\n\t" /* ARMv6+ */
769 "strex %[rc], %[uNew], %[pMem]\n\t"
770 "cmp %[rc], #0\n\t"
771 "bne Ltry_again_ASMAtomicXchgU32_%=\n\t"
772# endif
773 : [pMem] "+Q" (*pu32)
774 , [uOld] "=&r" (uOld)
775 , [rc] "=&r" (rcSpill)
776 : [uNew] "r" (u32)
777 RTASM_ARM_DMB_SY_COMMA_IN_REG
778 : "cc");
779# endif
780 return uOld;
781
782# else
783# error "Port me"
784# endif
785}
786#endif
787
788
789/**
790 * Atomically Exchange a signed 32-bit value, ordered.
791 *
792 * @returns Current *pu32 value
793 * @param pi32 Pointer to the 32-bit variable to update.
794 * @param i32 The 32-bit value to assign to *pi32.
795 */
796DECLINLINE(int32_t) ASMAtomicXchgS32(volatile int32_t RT_FAR *pi32, int32_t i32) RT_NOTHROW_DEF
797{
798 return (int32_t)ASMAtomicXchgU32((volatile uint32_t RT_FAR *)pi32, (uint32_t)i32);
799}
800
801
802/**
803 * Atomically Exchange an unsigned 64-bit value, ordered.
804 *
 * AMD64 uses XCHG or the _InterlockedExchange64 intrinsic. 32-bit x86 uses a
 * LOCK CMPXCHG8B retry loop. ARM uses either the LSE swap instruction (when
 * RTASM_ARM64_USE_FEAT_LSE is defined) or an exclusive load/store retry loop.
 * Other architectures hit the "Port me" error. Only a prototype is declared
 * when RT_INLINE_DONT_MIX_CMPXCHG8B_AND_PIC is non-zero, or when
 * RT_INLINE_ASM_EXTERNAL_TMP_ARM is non-zero and intrinsics are not used.
 *
805 * @returns Current *pu64 value (i.e. the value it held before the exchange).
806 * @param pu64 Pointer to the 64-bit variable to update.
807 * @param u64 The 64-bit value to assign to *pu64.
808 *
809 * @remarks Works on 32-bit x86 CPUs starting with Pentium.
810 */
811#if (RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN) \
812 || RT_INLINE_DONT_MIX_CMPXCHG8B_AND_PIC
813RT_ASM_DECL_PRAGMA_WATCOM(uint64_t) ASMAtomicXchgU64(volatile uint64_t RT_FAR *pu64, uint64_t u64) RT_NOTHROW_PROTO;
814#else
815DECLINLINE(uint64_t) ASMAtomicXchgU64(volatile uint64_t RT_FAR *pu64, uint64_t u64) RT_NOTHROW_DEF
816{
817# if defined(RT_ARCH_AMD64)
818# if RT_INLINE_ASM_USES_INTRIN
819 return _InterlockedExchange64((__int64 *)pu64, u64);
820
821# elif RT_INLINE_ASM_GNU_STYLE
822 __asm__ __volatile__("xchgq %0, %1\n\t"
823 : "=m" (*pu64)
824 , "=r" (u64)
825 : "1" (u64)
826 , "m" (*pu64));
827 return u64;
828# else
829 __asm
830 {
831 mov rdx, [pu64]
832 mov rax, [u64]
833 xchg [rdx], rax
834 mov [u64], rax
835 }
836 return u64;
837# endif
838
839# elif defined(RT_ARCH_X86)
/* 32-bit x86: loop on LOCK CMPXCHG8B with the new value in ECX:EBX and the
   expected/old value in EDX:EAX until the compare succeeds; EDX:EAX then
   holds the previous memory value, which is returned via u64. */
840# if RT_INLINE_ASM_GNU_STYLE
841# if defined(PIC) || defined(__PIC__)
/* PIC variant: EBX is not handed to the asm as an operand (presumably because
   it is reserved as the PIC register - confirm); instead the low dword of the
   new value is swapped into EBX from a memory temporary and EBX is restored
   after the loop. */
842 uint32_t u32EBX = (uint32_t)u64;
843 __asm__ __volatile__(/*"xchgl %%esi, %5\n\t"*/
844 "xchgl %%ebx, %3\n\t"
845 "1:\n\t"
846 "lock; cmpxchg8b (%5)\n\t"
847 "jnz 1b\n\t"
848 "movl %3, %%ebx\n\t"
849 /*"xchgl %%esi, %5\n\t"*/
850 : "=A" (u64)
851 , "=m" (*pu64)
852 : "0" (*pu64)
853 , "m" ( u32EBX )
854 , "c" ( (uint32_t)(u64 >> 32) )
855 , "S" (pu64)
856 : "cc");
857# else /* !PIC */
858 __asm__ __volatile__("1:\n\t"
859 "lock; cmpxchg8b %1\n\t"
860 "jnz 1b\n\t"
861 : "=A" (u64)
862 , "=m" (*pu64)
863 : "0" (*pu64)
864 , "b" ( (uint32_t)u64 )
865 , "c" ( (uint32_t)(u64 >> 32) )
866 : "cc");
867# endif
868# else
869 __asm
870 {
871 mov ebx, dword ptr [u64]
872 mov ecx, dword ptr [u64 + 4]
873 mov edi, pu64
874 mov eax, dword ptr [edi]
875 mov edx, dword ptr [edi + 4]
876 retry:
877 lock cmpxchg8b [edi]
878 jnz retry
879 mov dword ptr [u64], eax
880 mov dword ptr [u64 + 4], edx
881 }
882# endif
883 return u64;
884
885# elif defined(RT_ARCH_ARM32) || defined(RT_ARCH_ARM64)
886 uint64_t uOld;
887# if defined(RTASM_ARM64_USE_FEAT_LSE)
888 /* SWPAL is ~40% more expensive than the non-LSE variant on an M1, 20%
889 slower if we remove the barrier. But since we have the barrier we
890 shouldn't need that, right? Ordering should be taken care of by the DMB.
891 The SWP is rather cheap (~70% faster). */
892 __asm__ __volatile__("Lstart_ASMAtomicXchgU64_%=:\n\t"
893# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
894 "swpal %[uNew], %[uOld], %[pMem]\n\t"
895# else
896 RTASM_ARM_DMB_SY
897 "swp %[uNew], %[uOld], %[pMem]\n\t"
898# endif
899 : [pMem] "+Q" (*pu64)
900 , [uOld] "=&r" (uOld)
901 : [uNew] "r" (u64)
902 : );
903# else
/* Exclusive load/store pair, retried until the exclusive store succeeds
   (rc == 0); ARM32 uses the doubleword forms with register pairs. */
904 uint32_t rcSpill;
905 __asm__ __volatile__("Ltry_again_ASMAtomicXchgU64_%=:\n\t"
906 RTASM_ARM_DMB_SY
907# if defined(RT_ARCH_ARM64)
908 "ldaxr %[uOld], %[pMem]\n\t"
909 "stlxr %w[rc], %[uNew], %[pMem]\n\t"
910 "cbnz %w[rc], Ltry_again_ASMAtomicXchgU64_%=\n\t"
911# else
912 "ldrexd %[uOld], %H[uOld], %[pMem]\n\t" /* ARMv6+ */
913 "strexd %[rc], %[uNew], %H[uNew], %[pMem]\n\t"
914 "cmp %[rc], #0\n\t"
915 "bne Ltry_again_ASMAtomicXchgU64_%=\n\t"
916# endif
917 : [pMem] "+Q" (*pu64)
918 , [uOld] "=&r" (uOld)
919 , [rc] "=&r" (rcSpill)
920 : [uNew] "r" (u64)
921 RTASM_ARM_DMB_SY_COMMA_IN_REG
922 : "cc");
923# endif
924 return uOld;
925
926# else
927# error "Port me"
928# endif
929}
930#endif
931
932
933/**
934 * Atomically Exchange an signed 64-bit value, ordered.
935 *
936 * @returns Current *pi64 value
937 * @param pi64 Pointer to the 64-bit variable to update.
938 * @param i64 The 64-bit value to assign to *pi64.
939 */
940DECLINLINE(int64_t) ASMAtomicXchgS64(volatile int64_t RT_FAR *pi64, int64_t i64) RT_NOTHROW_DEF
941{
942 return (int64_t)ASMAtomicXchgU64((volatile uint64_t RT_FAR *)pi64, (uint64_t)i64);
943}
944
945
946/**
947 * Atomically Exchange a size_t value, ordered.
948 *
949 * @returns Current *ppv value
950 * @param puDst Pointer to the size_t variable to update.
951 * @param uNew The new value to assign to *puDst.
952 */
953DECLINLINE(size_t) ASMAtomicXchgZ(size_t volatile RT_FAR *puDst, const size_t uNew) RT_NOTHROW_DEF
954{
955#if ARCH_BITS == 16
956 AssertCompile(sizeof(size_t) == 2);
957 return ASMAtomicXchgU16((volatile uint16_t RT_FAR *)puDst, uNew);
958#elif ARCH_BITS == 32
959 return ASMAtomicXchgU32((volatile uint32_t RT_FAR *)puDst, uNew);
960#elif ARCH_BITS == 64
961 return ASMAtomicXchgU64((volatile uint64_t RT_FAR *)puDst, uNew);
962#else
963# error "ARCH_BITS is bogus"
964#endif
965}
966
967
968/**
969 * Atomically Exchange a pointer value, ordered.
970 *
971 * @returns Current *ppv value
972 * @param ppv Pointer to the pointer variable to update.
973 * @param pv The pointer value to assign to *ppv.
974 */
975DECLINLINE(void RT_FAR *) ASMAtomicXchgPtr(void RT_FAR * volatile RT_FAR *ppv, const void RT_FAR *pv) RT_NOTHROW_DEF
976{
977#if ARCH_BITS == 32 || ARCH_BITS == 16
978 return (void RT_FAR *)ASMAtomicXchgU32((volatile uint32_t RT_FAR *)(void RT_FAR *)ppv, (uint32_t)pv);
979#elif ARCH_BITS == 64
980 return (void RT_FAR *)ASMAtomicXchgU64((volatile uint64_t RT_FAR *)(void RT_FAR *)ppv, (uint64_t)pv);
981#else
982# error "ARCH_BITS is bogus"
983#endif
984}
985
986
/**
 * Convenience macro for avoiding the annoying casting with ASMAtomicXchgPtr.
 *
 * With GCC-compatible compilers the arguments are first assigned to typed
 * temporaries, so a mismatch between @a ppv, @a pv and @a Type is diagnosed by
 * the compiler; other compilers get a plain cast.
 *
 * @returns The pointer *ppv held before the exchange, as @a Type.
 * @param   ppv     Pointer to the pointer variable to update.
 * @param   pv      The pointer value to assign to *ppv.
 * @param   Type    The type of *ppv, sans volatile.
 */
#ifndef __GNUC__
# define ASMAtomicXchgPtrT(ppv, pv, Type) \
    (Type)ASMAtomicXchgPtr((void RT_FAR * volatile RT_FAR *)(ppv), (void RT_FAR *)(pv))
#else /* __GNUC__ - 8.2.0 requires -Wno-ignored-qualifiers */
# define ASMAtomicXchgPtrT(ppv, pv, Type) \
    __extension__ \
    ({ \
        __typeof__(*(ppv)) volatile * const ppvTypeChecked = (ppv); \
        Type const pvTypeChecked = (pv); \
        Type pvTypeCheckedRet = (__typeof__(*(ppv))) ASMAtomicXchgPtr((void * volatile *)ppvTypeChecked, \
                                                                      (void *)pvTypeChecked); \
        pvTypeCheckedRet; \
    })
#endif
1008
1009
1010/**
1011 * Atomically Exchange a raw-mode context pointer value, ordered.
1012 *
1013 * @returns Current *ppv value
1014 * @param ppvRC Pointer to the pointer variable to update.
1015 * @param pvRC The pointer value to assign to *ppv.
1016 */
1017DECLINLINE(RTRCPTR) ASMAtomicXchgRCPtr(RTRCPTR volatile RT_FAR *ppvRC, RTRCPTR pvRC) RT_NOTHROW_DEF
1018{
1019 return (RTRCPTR)ASMAtomicXchgU32((uint32_t volatile RT_FAR *)(void RT_FAR *)ppvRC, (uint32_t)pvRC);
1020}
1021
1022
1023/**
1024 * Atomically Exchange a ring-0 pointer value, ordered.
1025 *
1026 * @returns Current *ppv value
1027 * @param ppvR0 Pointer to the pointer variable to update.
1028 * @param pvR0 The pointer value to assign to *ppv.
1029 */
1030DECLINLINE(RTR0PTR) ASMAtomicXchgR0Ptr(RTR0PTR volatile RT_FAR *ppvR0, RTR0PTR pvR0) RT_NOTHROW_DEF
1031{
1032#if R0_ARCH_BITS == 32 || ARCH_BITS == 16
1033 return (RTR0PTR)ASMAtomicXchgU32((volatile uint32_t RT_FAR *)(void RT_FAR *)ppvR0, (uint32_t)pvR0);
1034#elif R0_ARCH_BITS == 64
1035 return (RTR0PTR)ASMAtomicXchgU64((volatile uint64_t RT_FAR *)(void RT_FAR *)ppvR0, (uint64_t)pvR0);
1036#else
1037# error "R0_ARCH_BITS is bogus"
1038#endif
1039}
1040
1041
1042/**
1043 * Atomically Exchange a ring-3 pointer value, ordered.
1044 *
1045 * @returns Current *ppv value
1046 * @param ppvR3 Pointer to the pointer variable to update.
1047 * @param pvR3 The pointer value to assign to *ppv.
1048 */
1049DECLINLINE(RTR3PTR) ASMAtomicXchgR3Ptr(RTR3PTR volatile RT_FAR *ppvR3, RTR3PTR pvR3) RT_NOTHROW_DEF
1050{
1051#if R3_ARCH_BITS == 32 || ARCH_BITS == 16
1052 return (RTR3PTR)ASMAtomicXchgU32((volatile uint32_t RT_FAR *)(void RT_FAR *)ppvR3, (uint32_t)pvR3);
1053#elif R3_ARCH_BITS == 64
1054 return (RTR3PTR)ASMAtomicXchgU64((volatile uint64_t RT_FAR *)(void RT_FAR *)ppvR3, (uint64_t)pvR3);
1055#else
1056# error "R3_ARCH_BITS is bogus"
1057#endif
1058}
1059
1060
1061/** @def ASMAtomicXchgHandle
1062 * Atomically Exchange a typical IPRT handle value, ordered.
1063 *
1064 * @param ph Pointer to the value to update.
1065 * @param hNew The new value to assigned to *pu.
1066 * @param phRes Where to store the current *ph value.
1067 *
1068 * @remarks This doesn't currently work for all handles (like RTFILE).
1069 */
1070#if HC_ARCH_BITS == 32 || ARCH_BITS == 16
1071# define ASMAtomicXchgHandle(ph, hNew, phRes) \
1072 do { \
1073 AssertCompile(sizeof(*(ph)) == sizeof(uint32_t)); \
1074 AssertCompile(sizeof(*(phRes)) == sizeof(uint32_t)); \
1075 *(uint32_t RT_FAR *)(phRes) = ASMAtomicXchgU32((uint32_t volatile RT_FAR *)(ph), (const uint32_t)(hNew)); \
1076 } while (0)
1077#elif HC_ARCH_BITS == 64
1078# define ASMAtomicXchgHandle(ph, hNew, phRes) \
1079 do { \
1080 AssertCompile(sizeof(*(ph)) == sizeof(uint64_t)); \
1081 AssertCompile(sizeof(*(phRes)) == sizeof(uint64_t)); \
1082 *(uint64_t RT_FAR *)(phRes) = ASMAtomicXchgU64((uint64_t volatile RT_FAR *)(ph), (const uint64_t)(hNew)); \
1083 } while (0)
1084#else
1085# error HC_ARCH_BITS
1086#endif
1087
1088
/**
 * Atomically Exchange a value which size might differ
 * between platforms or compilers, ordered.
 *
 * Dispatches on sizeof(*(pu)) to the 8, 16, 32 or 64-bit exchange primitive.
 * Any other size triggers an assertion.  The previous value is discarded.
 *
 * @param   pu      Pointer to the variable to update.
 * @param   uNew    The value to assign to *pu.
 * @todo    This is busted as its missing the result argument.
 */
#define ASMAtomicXchgSize(pu, uNew) \
    do { \
        switch (sizeof(*(pu))) { \
            case 1: ASMAtomicXchgU8( (volatile uint8_t  RT_FAR *)(void RT_FAR *)(pu), (uint8_t)(uNew)); break; \
            case 2: ASMAtomicXchgU16((volatile uint16_t RT_FAR *)(void RT_FAR *)(pu), (uint16_t)(uNew)); break; \
            case 4: ASMAtomicXchgU32((volatile uint32_t RT_FAR *)(void RT_FAR *)(pu), (uint32_t)(uNew)); break; \
            case 8: ASMAtomicXchgU64((volatile uint64_t RT_FAR *)(void RT_FAR *)(pu), (uint64_t)(uNew)); break; \
            /* sizeof yields size_t; cast so the argument matches the %d conversion. */ \
            default: AssertMsgFailed(("ASMAtomicXchgSize: size %d is not supported\n", (int)sizeof(*(pu)))); \
        } \
    } while (0)
1107
/**
 * Atomically Exchange a value which size might differ
 * between platforms or compilers, ordered.
 *
 * Dispatches on sizeof(*(pu)) to the 8, 16, 32 or 64-bit exchange primitive
 * and stores the previous value through @a puRes using the same width.  Any
 * other size triggers an assertion and leaves *puRes untouched.
 *
 * @param   pu      Pointer to the variable to update.
 * @param   uNew    The value to assign to *pu.
 * @param   puRes   Where to store the value *pu held before the exchange.
 */
#define ASMAtomicXchgSizeCorrect(pu, uNew, puRes) \
    do { \
        switch (sizeof(*(pu))) { \
            case 1: *(uint8_t  RT_FAR *)(puRes) = ASMAtomicXchgU8( (volatile uint8_t  RT_FAR *)(void RT_FAR *)(pu), (uint8_t)(uNew)); break; \
            case 2: *(uint16_t RT_FAR *)(puRes) = ASMAtomicXchgU16((volatile uint16_t RT_FAR *)(void RT_FAR *)(pu), (uint16_t)(uNew)); break; \
            case 4: *(uint32_t RT_FAR *)(puRes) = ASMAtomicXchgU32((volatile uint32_t RT_FAR *)(void RT_FAR *)(pu), (uint32_t)(uNew)); break; \
            case 8: *(uint64_t RT_FAR *)(puRes) = ASMAtomicXchgU64((volatile uint64_t RT_FAR *)(void RT_FAR *)(pu), (uint64_t)(uNew)); break; \
            /* sizeof yields size_t; cast so the argument matches the %d conversion. */ \
            default: AssertMsgFailed(("ASMAtomicXchgSizeCorrect: size %d is not supported\n", (int)sizeof(*(pu)))); \
        } \
    } while (0)
1126
1127
1128
1129/**
1130 * Atomically Compare and Exchange an unsigned 8-bit value, ordered.
1131 *
1132 * @returns true if xchg was done.
1133 * @returns false if xchg wasn't done.
1134 *
1135 * @param pu8 Pointer to the value to update.
1136 * @param u8New The new value to assigned to *pu8.
1137 * @param u8Old The old value to *pu8 compare with.
1138 *
1139 * @remarks x86: Requires a 486 or later.
1140 * @todo Rename ASMAtomicCmpWriteU8
1141 */
1142#if RT_INLINE_ASM_EXTERNAL_TMP_ARM || !RT_INLINE_ASM_GNU_STYLE
1143RT_ASM_DECL_PRAGMA_WATCOM(bool) ASMAtomicCmpXchgU8(volatile uint8_t RT_FAR *pu8, const uint8_t u8New, const uint8_t u8Old) RT_NOTHROW_PROTO;
1144#else
1145DECLINLINE(bool) ASMAtomicCmpXchgU8(volatile uint8_t RT_FAR *pu8, const uint8_t u8New, uint8_t u8Old) RT_NOTHROW_DEF
1146{
1147# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
1148 uint8_t u8Ret;
1149 __asm__ __volatile__("lock; cmpxchgb %3, %0\n\t"
1150 "setz %1\n\t"
1151 : "=m" (*pu8)
1152 , "=qm" (u8Ret)
1153 , "=a" (u8Old)
1154 : "q" (u8New)
1155 , "2" (u8Old)
1156 , "m" (*pu8)
1157 : "cc");
1158 return (bool)u8Ret;
1159
1160# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
1161 union { uint32_t u; bool f; } fXchg;
1162 uint32_t u32Spill;
1163# if defined(RTASM_ARM64_USE_FEAT_LSE)
1164 __asm__ __volatile__("Lstart_ASMAtomicCmpXchgU8_%=:\n\t"
1165# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB) /* M1 bench: casalb=5625 vs dmb+casb=1597 vs non-lse=5623 (ps/call) */
1166 "casalb %w[uOldActual], %w[uNew], %[pMem]\n\t"
1167# else
1168 RTASM_ARM_DMB_SY
1169 "casb %w[uOldActual], %w[uNew], %[pMem]\n\t"
1170# endif
1171 "cmp %w[uOldActual], %w[uOldOrg]\n\t"
1172 "cset %w[fXchg], eq\n\t"
1173 : [pMem] "+Q" (*pu8)
1174 , [uOldActual] "=&r" (u32Spill)
1175 , [fXchg] "=&r" (fXchg.u)
1176 : [uNew] "r" ((uint32_t)u8New)
1177 , [uOldOrg] "r" ((uint32_t)u8Old)
1178 , "[uOldActual]" ((uint32_t)u8Old)
1179 : "cc");
1180# else
1181 uint32_t rcSpill;
1182 __asm__ __volatile__("Ltry_again_ASMAtomicCmpXchgU8_%=:\n\t"
1183 RTASM_ARM_DMB_SY
1184# if defined(RT_ARCH_ARM64)
1185 "ldaxrb %w[uOld], %[pMem]\n\t"
1186 "cmp %w[uOld], %w[uCmp]\n\t"
1187 "bne 1f\n\t" /* stop here if not equal */
1188 "stlxrb %w[rc], %w[uNew], %[pMem]\n\t"
1189 "cbnz %w[rc], Ltry_again_ASMAtomicCmpXchgU8_%=\n\t"
1190 "mov %w[fXchg], #1\n\t"
1191 "1:\n\t"
1192 "clrex\n\t"
1193# else
1194 "ldrexb %[uOld], %[pMem]\n\t"
1195 "teq %[uOld], %[uCmp]\n\t"
1196 "strexbeq %[rc], %[uNew], %[pMem]\n\t"
1197 "bne 1f\n\t" /* stop here if not equal */
1198 "cmp %[rc], #0\n\t"
1199 "bne Ltry_again_ASMAtomicCmpXchgU8_%=\n\t"
1200 "mov %[fXchg], #1\n\t"
1201 "1:\n\t"
1202 /** @todo clrexne on armv7? */
1203# endif
1204 : [pMem] "+Q" (*pu8)
1205 , [uOld] "=&r" (u32Spill)
1206 , [rc] "=&r" (rcSpill)
1207 , [fXchg] "=&r" (fXchg.u)
1208 : [uCmp] "r" ((uint32_t)u8Old)
1209 , [uNew] "r" ((uint32_t)u8New)
1210 , "[fXchg]" (0)
1211 RTASM_ARM_DMB_SY_COMMA_IN_REG
1212 : "cc");
1213# endif
1214 return fXchg.f;
1215
1216# else
1217# error "Port me"
1218# endif
1219}
1220#endif
1221
1222
1223/**
1224 * Atomically Compare and Exchange a signed 8-bit value, ordered.
1225 *
1226 * @returns true if xchg was done.
1227 * @returns false if xchg wasn't done.
1228 *
1229 * @param pi8 Pointer to the value to update.
1230 * @param i8New The new value to assigned to *pi8.
1231 * @param i8Old The old value to *pi8 compare with.
1232 *
1233 * @remarks x86: Requires a 486 or later.
1234 * @todo Rename ASMAtomicCmpWriteS8
1235 */
1236DECLINLINE(bool) ASMAtomicCmpXchgS8(volatile int8_t RT_FAR *pi8, const int8_t i8New, const int8_t i8Old) RT_NOTHROW_DEF
1237{
1238 return ASMAtomicCmpXchgU8((volatile uint8_t RT_FAR *)pi8, (uint8_t)i8New, (uint8_t)i8Old);
1239}
1240
1241
1242/**
1243 * Atomically Compare and Exchange a bool value, ordered.
1244 *
1245 * @returns true if xchg was done.
1246 * @returns false if xchg wasn't done.
1247 *
1248 * @param pf Pointer to the value to update.
1249 * @param fNew The new value to assigned to *pf.
1250 * @param fOld The old value to *pf compare with.
1251 *
1252 * @remarks x86: Requires a 486 or later.
1253 * @todo Rename ASMAtomicCmpWriteBool
1254 */
1255DECLINLINE(bool) ASMAtomicCmpXchgBool(volatile bool RT_FAR *pf, const bool fNew, const bool fOld) RT_NOTHROW_DEF
1256{
1257 return ASMAtomicCmpXchgU8((volatile uint8_t RT_FAR *)pf, (uint8_t)fNew, (uint8_t)fOld);
1258}
1259
1260
1261/**
1262 * Atomically Compare and Exchange an unsigned 32-bit value, ordered.
1263 *
1264 * @returns true if xchg was done.
1265 * @returns false if xchg wasn't done.
1266 *
1267 * @param pu32 Pointer to the value to update.
1268 * @param u32New The new value to assigned to *pu32.
1269 * @param u32Old The old value to *pu32 compare with.
1270 *
1271 * @remarks x86: Requires a 486 or later.
1272 * @todo Rename ASMAtomicCmpWriteU32
1273 */
1274#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
1275RT_ASM_DECL_PRAGMA_WATCOM(bool) ASMAtomicCmpXchgU32(volatile uint32_t RT_FAR *pu32, const uint32_t u32New, const uint32_t u32Old) RT_NOTHROW_PROTO;
1276#else
1277DECLINLINE(bool) ASMAtomicCmpXchgU32(volatile uint32_t RT_FAR *pu32, const uint32_t u32New, uint32_t u32Old) RT_NOTHROW_DEF
1278{
1279# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
1280# if RT_INLINE_ASM_GNU_STYLE
1281 uint8_t u8Ret;
1282 __asm__ __volatile__("lock; cmpxchgl %3, %0\n\t"
1283 "setz %1\n\t"
1284 : "=m" (*pu32)
1285 , "=qm" (u8Ret)
1286 , "=a" (u32Old)
1287 : "r" (u32New)
1288 , "2" (u32Old)
1289 , "m" (*pu32)
1290 : "cc");
1291 return (bool)u8Ret;
1292
1293# elif RT_INLINE_ASM_USES_INTRIN
1294 return (uint32_t)_InterlockedCompareExchange((long RT_FAR *)pu32, u32New, u32Old) == u32Old;
1295
1296# else
1297 uint32_t u32Ret;
1298 __asm
1299 {
1300# ifdef RT_ARCH_AMD64
1301 mov rdx, [pu32]
1302# else
1303 mov edx, [pu32]
1304# endif
1305 mov eax, [u32Old]
1306 mov ecx, [u32New]
1307# ifdef RT_ARCH_AMD64
1308 lock cmpxchg [rdx], ecx
1309# else
1310 lock cmpxchg [edx], ecx
1311# endif
1312 setz al
1313 movzx eax, al
1314 mov [u32Ret], eax
1315 }
1316 return !!u32Ret;
1317# endif
1318
1319# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
1320 union { uint32_t u; bool f; } fXchg;
1321 uint32_t u32Spill;
1322 /* M1 bench: match: casal= 6592 vs dmb+cas= 1562 vs non-lse=5634 (ps/call)
1323 mismatch: casal=18794 vs dmb+cas=19697 vs non-lse=2499 (ps/call) */
1324# if defined(RTASM_ARM64_USE_FEAT_LSE)
1325 __asm__ __volatile__("Lstart_ASMAtomicCmpXchgU32_%=:\n\t"
1326# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
1327 "casal %w[uOldActual], %w[uNew], %[pMem]\n\t"
1328# else
1329 RTASM_ARM_DMB_SY
1330 "cas %w[uOldActual], %w[uNew], %[pMem]\n\t"
1331# endif
1332 "cmp %w[uOldActual], %w[uOldOrg]\n\t"
1333 "cset %w[fXchg], eq\n\t"
1334 : [pMem] "+Q" (*pu32)
1335 , [uOldActual] "=&r" (u32Spill)
1336 , [fXchg] "=&r" (fXchg.u)
1337 : [uNew] "r" (u32New)
1338 , [uOldOrg] "r" (u32Old)
1339 , "[uOldActual]" (u32Old)
1340 : "cc");
1341# else
1342 uint32_t rcSpill;
1343 __asm__ __volatile__("Ltry_again_ASMAtomicCmpXchgU32_%=:\n\t"
1344 RTASM_ARM_DMB_SY
1345# if defined(RT_ARCH_ARM64)
1346 "ldaxr %w[uOld], %[pMem]\n\t"
1347 "cmp %w[uOld], %w[uCmp]\n\t"
1348 "bne 1f\n\t" /* stop here if not equal */
1349 "stlxr %w[rc], %w[uNew], %[pMem]\n\t"
1350 "cbnz %w[rc], Ltry_again_ASMAtomicCmpXchgU32_%=\n\t"
1351 "mov %w[fXchg], #1\n\t"
1352 "1:\n\t"
1353 "clrex\n\t"
1354# else
1355 "ldrex %[uOld], %[pMem]\n\t"
1356 "teq %[uOld], %[uCmp]\n\t"
1357 "strexeq %[rc], %[uNew], %[pMem]\n\t"
1358 "bne 1f\n\t" /* stop here if not equal */
1359 "cmp %[rc], #0\n\t"
1360 "bne Ltry_again_ASMAtomicCmpXchgU32_%=\n\t"
1361 "mov %[fXchg], #1\n\t"
1362 "1:\n\t"
1363 /** @todo clrexne on armv7? */
1364# endif
1365 : [pMem] "+Q" (*pu32)
1366 , [uOld] "=&r" (u32Spill)
1367 , [rc] "=&r" (rcSpill)
1368 , [fXchg] "=&r" (fXchg.u)
1369 : [uCmp] "r" (u32Old)
1370 , [uNew] "r" (u32New)
1371 , "[fXchg]" (0)
1372 RTASM_ARM_DMB_SY_COMMA_IN_REG
1373 : "cc");
1374# endif
1375 return fXchg.f;
1376
1377# else
1378# error "Port me"
1379# endif
1380}
1381#endif
1382
1383
1384/**
1385 * Atomically Compare and Exchange a signed 32-bit value, ordered.
1386 *
1387 * @returns true if xchg was done.
1388 * @returns false if xchg wasn't done.
1389 *
1390 * @param pi32 Pointer to the value to update.
1391 * @param i32New The new value to assigned to *pi32.
1392 * @param i32Old The old value to *pi32 compare with.
1393 *
1394 * @remarks x86: Requires a 486 or later.
1395 * @todo Rename ASMAtomicCmpWriteS32
1396 */
1397DECLINLINE(bool) ASMAtomicCmpXchgS32(volatile int32_t RT_FAR *pi32, const int32_t i32New, const int32_t i32Old) RT_NOTHROW_DEF
1398{
1399 return ASMAtomicCmpXchgU32((volatile uint32_t RT_FAR *)pi32, (uint32_t)i32New, (uint32_t)i32Old);
1400}
1401
1402
1403/**
1404 * Atomically Compare and exchange an unsigned 64-bit value, ordered.
1405 *
1406 * @returns true if xchg was done.
1407 * @returns false if xchg wasn't done.
1408 *
1409 * @param pu64 Pointer to the 64-bit variable to update.
1410 * @param u64New The 64-bit value to assign to *pu64.
1411 * @param u64Old The value to compare with.
1412 *
1413 * @remarks x86: Requires a Pentium or later.
1414 * @todo Rename ASMAtomicCmpWriteU64
1415 */
1416#if (RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN) \
1417 || RT_INLINE_DONT_MIX_CMPXCHG8B_AND_PIC
1418RT_ASM_DECL_PRAGMA_WATCOM(bool) ASMAtomicCmpXchgU64(volatile uint64_t RT_FAR *pu64, const uint64_t u64New, const uint64_t u64Old) RT_NOTHROW_PROTO;
1419#else
1420DECLINLINE(bool) ASMAtomicCmpXchgU64(volatile uint64_t RT_FAR *pu64, uint64_t u64New, uint64_t u64Old) RT_NOTHROW_DEF
1421{
1422# if RT_INLINE_ASM_USES_INTRIN
1423 return (uint64_t)_InterlockedCompareExchange64((__int64 RT_FAR *)pu64, u64New, u64Old) == u64Old;
1424
1425# elif defined(RT_ARCH_AMD64)
1426# if RT_INLINE_ASM_GNU_STYLE
1427 uint8_t u8Ret;
1428 __asm__ __volatile__("lock; cmpxchgq %3, %0\n\t"
1429 "setz %1\n\t"
1430 : "=m" (*pu64)
1431 , "=qm" (u8Ret)
1432 , "=a" (u64Old)
1433 : "r" (u64New)
1434 , "2" (u64Old)
1435 , "m" (*pu64)
1436 : "cc");
1437 return (bool)u8Ret;
1438# else
1439 bool fRet;
1440 __asm
1441 {
1442 mov rdx, [pu32]
1443 mov rax, [u64Old]
1444 mov rcx, [u64New]
1445 lock cmpxchg [rdx], rcx
1446 setz al
1447 mov [fRet], al
1448 }
1449 return fRet;
1450# endif
1451
1452# elif defined(RT_ARCH_X86)
1453 uint32_t u32Ret;
1454# if RT_INLINE_ASM_GNU_STYLE
1455# if defined(PIC) || defined(__PIC__)
1456 uint32_t u32EBX = (uint32_t)u64New;
1457 uint32_t u32Spill;
1458 __asm__ __volatile__("xchgl %%ebx, %4\n\t"
1459 "lock; cmpxchg8b (%6)\n\t"
1460 "setz %%al\n\t"
1461 "movl %4, %%ebx\n\t"
1462 "movzbl %%al, %%eax\n\t"
1463 : "=a" (u32Ret)
1464 , "=d" (u32Spill)
1465# if RT_GNUC_PREREQ(4, 3)
1466 , "+m" (*pu64)
1467# else
1468 , "=m" (*pu64)
1469# endif
1470 : "A" (u64Old)
1471 , "m" ( u32EBX )
1472 , "c" ( (uint32_t)(u64New >> 32) )
1473 , "S" (pu64)
1474 : "cc");
1475# else /* !PIC */
1476 uint32_t u32Spill;
1477 __asm__ __volatile__("lock; cmpxchg8b %2\n\t"
1478 "setz %%al\n\t"
1479 "movzbl %%al, %%eax\n\t"
1480 : "=a" (u32Ret)
1481 , "=d" (u32Spill)
1482 , "+m" (*pu64)
1483 : "A" (u64Old)
1484 , "b" ( (uint32_t)u64New )
1485 , "c" ( (uint32_t)(u64New >> 32) )
1486 : "cc");
1487# endif
1488 return (bool)u32Ret;
1489# else
1490 __asm
1491 {
1492 mov ebx, dword ptr [u64New]
1493 mov ecx, dword ptr [u64New + 4]
1494 mov edi, [pu64]
1495 mov eax, dword ptr [u64Old]
1496 mov edx, dword ptr [u64Old + 4]
1497 lock cmpxchg8b [edi]
1498 setz al
1499 movzx eax, al
1500 mov dword ptr [u32Ret], eax
1501 }
1502 return !!u32Ret;
1503# endif
1504
1505# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
1506 union { uint32_t u; bool f; } fXchg;
1507 uint64_t u64Spill;
1508 /* M1 bench: match: casal= 6599 vs dmb+cas= 1565 vs non-lse=5000 (ps/call)
1509 mismatch: casal=18797 vs dmb+cas=19731 vs non-lse=2512 (ps/call) */
1510# if defined(RTASM_ARM64_USE_FEAT_LSE)
1511 __asm__ __volatile__("Lstart_ASMAtomicCmpXchgU75_%=:\n\t"
1512# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
1513 "casal %[uOldActual], %[uNew], %[pMem]\n\t"
1514# else
1515 RTASM_ARM_DMB_SY
1516 "cas %[uOldActual], %[uNew], %[pMem]\n\t"
1517# endif
1518 "cmp %[uOldActual], %[uOldOrg]\n\t"
1519 "cset %w[fXchg], eq\n\t"
1520 : [pMem] "+Q" (*pu64)
1521 , [uOldActual] "=&r" (u64Spill)
1522 , [fXchg] "=&r" (fXchg.u)
1523 : [uNew] "r" (u64New)
1524 , [uOldOrg] "r" (u64Old)
1525 , "[uOldActual]" (u64Old)
1526 : "cc");
1527# else
1528 uint32_t rcSpill;
1529 __asm__ __volatile__("Ltry_again_ASMAtomicCmpXchgU64_%=:\n\t"
1530 RTASM_ARM_DMB_SY
1531# if defined(RT_ARCH_ARM64)
1532 "ldaxr %[uOld], %[pMem]\n\t"
1533 "cmp %[uOld], %[uCmp]\n\t"
1534 "bne 1f\n\t" /* stop here if not equal */
1535 "stlxr %w[rc], %[uNew], %[pMem]\n\t"
1536 "cbnz %w[rc], Ltry_again_ASMAtomicCmpXchgU64_%=\n\t"
1537 "mov %w[fXchg], #1\n\t"
1538 "1:\n\t"
1539 "clrex\n\t"
1540# else
1541 "ldrexd %[uOld], %H[uOld], %[pMem]\n\t"
1542 "teq %[uOld], %[uCmp]\n\t"
1543 "teqeq %H[uOld], %H[uCmp]\n\t"
1544 "strexdeq %[rc], %[uNew], %H[uNew], %[pMem]\n\t"
1545 "bne 1f\n\t" /* stop here if not equal */
1546 "cmp %[rc], #0\n\t"
1547 "bne Ltry_again_ASMAtomicCmpXchgU64_%=\n\t"
1548 "mov %[fXchg], #1\n\t"
1549 "1:\n\t"
1550 /** @todo clrexne on armv7? */
1551# endif
1552 : [pMem] "+Q" (*pu64)
1553 , [uOld] "=&r" (u64Spill)
1554 , [rc] "=&r" (rcSpill)
1555 , [fXchg] "=&r" (fXchg.u)
1556 : [uCmp] "r" (u64Old)
1557 , [uNew] "r" (u64New)
1558 , "[fXchg]" (0)
1559 RTASM_ARM_DMB_SY_COMMA_IN_REG
1560 : "cc");
1561# endif
1562 return fXchg.f;
1563
1564# else
1565# error "Port me"
1566# endif
1567}
1568#endif
1569
1570
1571/**
1572 * Atomically Compare and exchange a signed 64-bit value, ordered.
1573 *
1574 * @returns true if xchg was done.
1575 * @returns false if xchg wasn't done.
1576 *
1577 * @param pi64 Pointer to the 64-bit variable to update.
1578 * @param i64 The 64-bit value to assign to *pu64.
1579 * @param i64Old The value to compare with.
1580 *
1581 * @remarks x86: Requires a Pentium or later.
1582 * @todo Rename ASMAtomicCmpWriteS64
1583 */
1584DECLINLINE(bool) ASMAtomicCmpXchgS64(volatile int64_t RT_FAR *pi64, const int64_t i64, const int64_t i64Old) RT_NOTHROW_DEF
1585{
1586 return ASMAtomicCmpXchgU64((volatile uint64_t RT_FAR *)pi64, (uint64_t)i64, (uint64_t)i64Old);
1587}
1588
1589#if defined(RT_ARCH_AMD64) || defined(RT_ARCH_ARM64) || defined(DOXYGEN_RUNNING)
1590
1591/** @def RTASM_HAVE_CMP_WRITE_U128
1592 * Indicates that we've got ASMAtomicCmpWriteU128(), ASMAtomicCmpWriteU128v2()
1593 * and ASMAtomicCmpWriteExU128() available. */
1594# define RTASM_HAVE_CMP_WRITE_U128 1
1595
1596
1597/**
1598 * Atomically compare and write an unsigned 128-bit value, ordered.
1599 *
1600 * @returns true if write was done.
1601 * @returns false if write wasn't done.
1602 *
1603 * @param pu128 Pointer to the 128-bit variable to update.
1604 * @param u64NewHi The high 64 bits of the value to assign to *pu128.
1605 * @param u64NewLo The low 64 bits of the value to assign to *pu128.
1606 * @param u64OldHi The high 64-bit of the value to compare with.
1607 * @param u64OldLo The low 64-bit of the value to compare with.
1608 *
1609 * @remarks AMD64: Not present in the earliest CPUs, so check CPUID.
1610 */
1611# if (RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN)
1612DECLASM(bool) ASMAtomicCmpWriteU128v2(volatile uint128_t *pu128, const uint64_t u64NewHi, const uint64_t u64NewLo,
1613 const uint64_t u64OldHi, const uint64_t u64OldLo) RT_NOTHROW_PROTO;
1614# else
1615DECLINLINE(bool) ASMAtomicCmpWriteU128v2(volatile uint128_t *pu128, const uint64_t u64NewHi, const uint64_t u64NewLo,
1616 const uint64_t u64OldHi, const uint64_t u64OldLo) RT_NOTHROW_DEF
1617{
1618# if RT_INLINE_ASM_USES_INTRIN
1619 __int64 ai64Cmp[2];
1620 ai64Cmp[0] = u64OldLo;
1621 ai64Cmp[1] = u64OldHi;
1622 return _InterlockedCompareExchange128((__int64 volatile *)pu128, u64NewHi, u64NewLo, ai64Cmp) != 0;
1623
1624# elif (defined(__clang_major__) || defined(__GNUC__)) && defined(RT_ARCH_ARM64)
1625 return __sync_bool_compare_and_swap(pu128, ((uint128_t)u64OldHi << 64) | u64OldLo, ((uint128_t)u64NewHi << 64) | u64NewLo);
1626
1627# elif defined(RT_ARCH_AMD64)
1628# if RT_INLINE_ASM_GNU_STYLE
1629 uint64_t u64Ret;
1630 uint64_t u64Spill;
1631 __asm__ __volatile__("lock; cmpxchg16b %2\n\t"
1632 "setz %%al\n\t"
1633 "movzbl %%al, %%eax\n\t"
1634 : "=a" (u64Ret)
1635 , "=d" (u64Spill)
1636 , "+m" (*pu128)
1637 : "a" (u64OldLo)
1638 , "d" (u64OldHi)
1639 , "b" (u64NewLo)
1640 , "c" (u64NewHi)
1641 : "cc");
1642
1643 return (bool)u64Ret;
1644# else
1645# error "Port me"
1646# endif
1647# else
1648# error "Port me"
1649# endif
1650}
1651# endif
1652
1653
1654/**
1655 * Atomically compare and write an unsigned 128-bit value, ordered.
1656 *
1657 * @returns true if write was done.
1658 * @returns false if write wasn't done.
1659 *
1660 * @param pu128 Pointer to the 128-bit variable to update.
1661 * @param u128New The 128-bit value to assign to *pu128.
1662 * @param u128Old The value to compare with.
1663 *
1664 * @remarks AMD64: Not present in the earliest CPUs, so check CPUID.
1665 */
1666DECLINLINE(bool) ASMAtomicCmpWriteU128(volatile uint128_t *pu128, const uint128_t u128New, const uint128_t u128Old) RT_NOTHROW_DEF
1667{
1668# ifdef RT_COMPILER_WITH_128BIT_INT_TYPES
1669# if (defined(__clang_major__) || defined(__GNUC__)) && defined(RT_ARCH_ARM64)
1670 return __sync_bool_compare_and_swap(pu128, u128Old, u128New);
1671# else
1672 return ASMAtomicCmpWriteU128v2(pu128, (uint64_t)(u128New >> 64), (uint64_t)u128New,
1673 (uint64_t)(u128Old >> 64), (uint64_t)u128Old);
1674# endif
1675# else
1676 return ASMAtomicCmpWriteU128v2(pu128, u128New.Hi, u128New.Lo, u128Old.Hi, u128Old.Lo);
1677# endif
1678}
1679
1680
1681/**
1682 * RTUINT128U wrapper for ASMAtomicCmpWriteU128.
1683 */
1684DECLINLINE(bool) ASMAtomicCmpWriteU128U(volatile RTUINT128U *pu128, const RTUINT128U u128New,
1685 const RTUINT128U u128Old) RT_NOTHROW_DEF
1686{
1687# if (defined(__clang_major__) || defined(__GNUC__)) && defined(RT_ARCH_ARM64)
1688 return ASMAtomicCmpWriteU128(&pu128->u, u128New.u, u128Old.u);
1689# else
1690 return ASMAtomicCmpWriteU128v2(&pu128->u, u128New.s.Hi, u128New.s.Lo, u128Old.s.Hi, u128Old.s.Lo);
1691# endif
1692}
1693
1694#endif /* RT_ARCH_AMD64 || RT_ARCH_ARM64 */
1695
1696/**
1697 * Atomically Compare and Exchange a pointer value, ordered.
1698 *
1699 * @returns true if xchg was done.
1700 * @returns false if xchg wasn't done.
1701 *
1702 * @param ppv Pointer to the value to update.
1703 * @param pvNew The new value to assigned to *ppv.
1704 * @param pvOld The old value to *ppv compare with.
1705 *
1706 * @remarks x86: Requires a 486 or later.
1707 * @todo Rename ASMAtomicCmpWritePtrVoid
1708 */
1709DECLINLINE(bool) ASMAtomicCmpXchgPtrVoid(void RT_FAR * volatile RT_FAR *ppv, const void RT_FAR *pvNew, const void RT_FAR *pvOld) RT_NOTHROW_DEF
1710{
1711#if ARCH_BITS == 32 || ARCH_BITS == 16
1712 return ASMAtomicCmpXchgU32((volatile uint32_t RT_FAR *)(void RT_FAR *)ppv, (uint32_t)pvNew, (uint32_t)pvOld);
1713#elif ARCH_BITS == 64
1714 return ASMAtomicCmpXchgU64((volatile uint64_t RT_FAR *)(void RT_FAR *)ppv, (uint64_t)pvNew, (uint64_t)pvOld);
1715#else
1716# error "ARCH_BITS is bogus"
1717#endif
1718}
1719
1720
/**
 * Atomically Compare and Exchange a pointer value, ordered.
 *
 * Wrapper around ASMAtomicCmpXchgPtrVoid() that takes care of the casting.
 * With GCC-compatible compilers @a pvNew and @a pvOld are first assigned to
 * temporaries of the type of *ppv, so mismatching pointer types are diagnosed.
 *
 * @returns true if xchg was done.
 * @returns false if xchg wasn't done.
 *
 * @param   ppv         Pointer to the value to update.
 * @param   pvNew       The new value to assign to *ppv.
 * @param   pvOld       The value *ppv is compared with.
 *
 * @remarks This is relatively type safe on GCC platforms.
 * @remarks x86: Requires a 486 or later.
 * @todo Rename ASMAtomicCmpWritePtr
 */
#ifndef __GNUC__
# define ASMAtomicCmpXchgPtr(ppv, pvNew, pvOld) \
    ASMAtomicCmpXchgPtrVoid((void RT_FAR * volatile RT_FAR *)(ppv), (void RT_FAR *)(pvNew), (void RT_FAR *)(pvOld))
#else /* __GNUC__ */
# define ASMAtomicCmpXchgPtr(ppv, pvNew, pvOld) \
    __extension__ \
    ({ \
        __typeof__(*(ppv)) volatile * const ppvTypeChecked   = (ppv); \
        __typeof__(*(ppv)) const            pvNewTypeChecked = (pvNew); \
        __typeof__(*(ppv)) const            pvOldTypeChecked = (pvOld); \
        bool fMacroRet = ASMAtomicCmpXchgPtrVoid((void * volatile *)ppvTypeChecked, \
                                                 (void *)pvNewTypeChecked, (void *)pvOldTypeChecked); \
        fMacroRet; \
    })
#endif
1750
1751
1752/** @def ASMAtomicCmpXchgHandle
1753 * Atomically Compare and Exchange a typical IPRT handle value, ordered.
1754 *
1755 * @param ph Pointer to the value to update.
1756 * @param hNew The new value to assigned to *pu.
1757 * @param hOld The old value to *pu compare with.
1758 * @param fRc Where to store the result.
1759 *
1760 * @remarks This doesn't currently work for all handles (like RTFILE).
1761 * @remarks x86: Requires a 486 or later.
1762 * @todo Rename ASMAtomicCmpWriteHandle
1763 */
1764#if HC_ARCH_BITS == 32 || ARCH_BITS == 16
1765# define ASMAtomicCmpXchgHandle(ph, hNew, hOld, fRc) \
1766 do { \
1767 AssertCompile(sizeof(*(ph)) == sizeof(uint32_t)); \
1768 (fRc) = ASMAtomicCmpXchgU32((uint32_t volatile RT_FAR *)(ph), (const uint32_t)(hNew), (const uint32_t)(hOld)); \
1769 } while (0)
1770#elif HC_ARCH_BITS == 64
1771# define ASMAtomicCmpXchgHandle(ph, hNew, hOld, fRc) \
1772 do { \
1773 AssertCompile(sizeof(*(ph)) == sizeof(uint64_t)); \
1774 (fRc) = ASMAtomicCmpXchgU64((uint64_t volatile RT_FAR *)(ph), (const uint64_t)(hNew), (const uint64_t)(hOld)); \
1775 } while (0)
1776#else
1777# error HC_ARCH_BITS
1778#endif
1779
1780
/** @def ASMAtomicCmpXchgSize
 * Atomically Compare and Exchange a value which size might differ
 * between platforms or compilers, ordered.
 *
 * Only 4 and 8 byte objects are handled; any other size triggers an assertion
 * and sets @a fRc to false.
 *
 * @param   pu          Pointer to the value to update.
 * @param   uNew        The new value to assign to *pu.
 * @param   uOld        The value *pu is compared with.
 * @param   fRc         Where to store the result.
 *
 * @remarks x86: Requires a 486 or later.
 * @todo Rename ASMAtomicCmpWriteSize
 */
#define ASMAtomicCmpXchgSize(pu, uNew, uOld, fRc) \
    do { \
        switch (sizeof(*(pu))) { \
            case 4: (fRc) = ASMAtomicCmpXchgU32((volatile uint32_t RT_FAR *)(void RT_FAR *)(pu), (uint32_t)(uNew), (uint32_t)(uOld)); \
                break; \
            case 8: (fRc) = ASMAtomicCmpXchgU64((volatile uint64_t RT_FAR *)(void RT_FAR *)(pu), (uint64_t)(uNew), (uint64_t)(uOld)); \
                break; \
            /* sizeof yields size_t; cast so the argument matches the %d conversion. */ \
            default: AssertMsgFailed(("ASMAtomicCmpXchgSize: size %d is not supported\n", (int)sizeof(*(pu)))); \
                (fRc) = false; \
                break; \
        } \
    } while (0)
1805
1806
1807/**
1808 * Atomically Compare and Exchange an unsigned 8-bit value, additionally passes
1809 * back old value, ordered.
1810 *
1811 * @returns true if xchg was done.
1812 * @returns false if xchg wasn't done.
1813 *
1814 * @param pu8 Pointer to the value to update.
1815 * @param u8New The new value to assigned to *pu32.
1816 * @param u8Old The old value to *pu8 compare with.
1817 * @param pu8Old Pointer store the old value at.
1818 *
1819 * @remarks x86: Requires a 486 or later.
1820 */
1821#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
1822RT_ASM_DECL_PRAGMA_WATCOM(bool) ASMAtomicCmpXchgExU8(volatile uint8_t RT_FAR *pu8, const uint8_t u8New, const uint8_t u8Old, uint8_t RT_FAR *pu8Old) RT_NOTHROW_PROTO;
1823#else
1824DECLINLINE(bool) ASMAtomicCmpXchgExU8(volatile uint8_t RT_FAR *pu8, const uint8_t u8New, const uint8_t u8Old, uint8_t RT_FAR *pu8Old) RT_NOTHROW_DEF
1825{
1826# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
1827# if RT_INLINE_ASM_GNU_STYLE
1828 uint8_t u8Ret;
1829 __asm__ __volatile__("lock; cmpxchgb %3, %0\n\t"
1830 "setz %1\n\t"
1831 : "=m" (*pu8)
1832 , "=qm" (u8Ret)
1833 , "=a" (*pu8Old)
1834# if defined(RT_ARCH_X86)
1835 : "q" (u8New)
1836# else
1837 : "r" (u8New)
1838# endif
1839 , "a" (u8Old)
1840 , "m" (*pu8)
1841 : "cc");
1842 return (bool)u8Ret;
1843
1844# elif RT_INLINE_ASM_USES_INTRIN
1845 return (*pu8Old = _InterlockedCompareExchange8((char RT_FAR *)pu8, u8New, u8Old)) == u8Old;
1846
1847# else
1848 uint8_t u8Ret;
1849 __asm
1850 {
1851# ifdef RT_ARCH_AMD64
1852 mov rdx, [pu8]
1853# else
1854 mov edx, [pu8]
1855# endif
1856 mov eax, [u8Old]
1857 mov ecx, [u8New]
1858# ifdef RT_ARCH_AMD64
1859 lock cmpxchg [rdx], ecx
1860 mov rdx, [pu8Old]
1861 mov [rdx], eax
1862# else
1863 lock cmpxchg [edx], ecx
1864 mov edx, [pu8Old]
1865 mov [edx], eax
1866# endif
1867 setz al
1868 movzx eax, al
1869 mov [u8Ret], eax
1870 }
1871 return !!u8Ret;
1872# endif
1873
1874# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
1875 /* M1 bench: match: casalb= 6594 vs dmb+casb= 1561 vs non-lse=5051 (ps/call)
1876 mismatch: casalb=15346 vs dmb+casb=16349 vs non-lse=2505 (ps/call) */
1877# if defined(RTASM_ARM64_USE_FEAT_LSE)
1878 union { uint32_t u; bool f; } fXchg;
1879 uint32_t u32Actual;
1880 __asm__ __volatile__("Lstart_ASMAtomicCmpXchgExU8_%=:\n\t"
1881# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
1882 "casalb %w[uOldActual], %w[uNew], %[pMem]\n\t"
1883# else
1884 RTASM_ARM_DMB_SY
1885 "casb %w[uOldActual], %w[uNew], %[pMem]\n\t"
1886# endif
1887 "cmp %w[uOldActual], %w[uOldOrg]\n\t"
1888 "cset %w[fXchg], eq\n\t"
1889 : [pMem] "+Q" (*pu8)
1890 , [uOldActual] "=&r" (u32Actual)
1891 , [fXchg] "=&r" (fXchg.u)
1892 : [uNew] "r" ((uint32_t)u8New)
1893 , [uOldOrg] "r" ((uint32_t)u8Old)
1894 , "[uOldActual]" ((uint32_t)u8Old)
1895 : "cc");
1896 *pu8Old = (uint8_t)u32Actual;
1897# else
1898 union { uint8_t u; bool f; } fXchg;
1899 uint8_t u8ActualOld;
1900 uint8_t rcSpill;
1901 __asm__ __volatile__("Ltry_again_ASMAtomicCmpXchgExU8_%=:\n\t"
1902 RTASM_ARM_DMB_SY
1903# if defined(RT_ARCH_ARM64)
1904 "ldaxrb %w[uOld], %[pMem]\n\t"
1905 "cmp %w[uOld], %w[uCmp]\n\t"
1906 "bne 1f\n\t" /* stop here if not equal */
1907 "stlxrb %w[rc], %w[uNew], %[pMem]\n\t"
1908 "cbnz %w[rc], Ltry_again_ASMAtomicCmpXchgExU8_%=\n\t"
1909 "mov %w[fXchg], #1\n\t"
1910 "1:\n\t"
1911 "clrex\n\t"
1912# else
1913 "ldrexb %[uOld], %[pMem]\n\t"
1914 "teq %[uOld], %[uCmp]\n\t"
1915 "strexbeq %[rc], %[uNew], %[pMem]\n\t"
1916 "bne 1f\n\t" /* stop here if not equal */
1917 "cmp %[rc], #0\n\t"
1918 "bne Ltry_again_ASMAtomicCmpXchgExU8_%=\n\t"
1919 "mov %[fXchg], #1\n\t"
1920 "1:\n\t"
1921 /** @todo clrexne on armv7? */
1922# endif
1923 : [pMem] "+Q" (*pu8)
1924 , [uOld] "=&r" (u8ActualOld)
1925 , [rc] "=&r" (rcSpill)
1926 , [fXchg] "=&r" (fXchg.u)
1927 : [uCmp] "r" (u8Old)
1928 , [uNew] "r" (u8New)
1929 , "[fXchg]" (0)
1930 RTASM_ARM_DMB_SY_COMMA_IN_REG
1931 : "cc");
1932 *pu8Old = u8ActualOld;
1933# endif
1934 return fXchg.f;
1935
1936# else
1937# error "Port me"
1938# endif
1939}
1940#endif
1941
1942
1943/**
1944 * Atomically Compare and Exchange a signed 8-bit value, additionally
1945 * passes back old value, ordered.
1946 *
1947 * @returns true if xchg was done.
1948 * @returns false if xchg wasn't done.
1949 *
1950 * @param pi8 Pointer to the value to update.
1951 * @param i8New The new value to assigned to *pi8.
1952 * @param i8Old The old value to *pi8 compare with.
1953 * @param pi8Old Pointer store the old value at.
1954 *
1955 * @remarks x86: Requires a 486 or later.
1956 */
1957DECLINLINE(bool) ASMAtomicCmpXchgExS8(volatile int8_t RT_FAR *pi8, const int8_t i8New, const int8_t i8Old, int8_t RT_FAR *pi8Old) RT_NOTHROW_DEF
1958{
1959 return ASMAtomicCmpXchgExU8((volatile uint8_t RT_FAR *)pi8, (uint8_t)i8New, (uint8_t)i8Old, (uint8_t RT_FAR *)pi8Old);
1960}
1961
1962
1963/**
1964 * Atomically Compare and Exchange an unsigned 16-bit value, additionally passes
1965 * back old value, ordered.
1966 *
1967 * @returns true if xchg was done.
1968 * @returns false if xchg wasn't done.
1969 *
1970 * @param pu16 Pointer to the value to update.
1971 * @param u16New The new value to assigned to *pu16.
1972 * @param u16Old The old value to *pu32 compare with.
1973 * @param pu16Old Pointer store the old value at.
1974 *
1975 * @remarks x86: Requires a 486 or later.
1976 */
1977#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
1978RT_ASM_DECL_PRAGMA_WATCOM(bool) ASMAtomicCmpXchgExU16(volatile uint16_t RT_FAR *pu16, const uint16_t u16New, const uint16_t u16Old, uint16_t RT_FAR *pu16Old) RT_NOTHROW_PROTO;
1979#else
1980DECLINLINE(bool) ASMAtomicCmpXchgExU16(volatile uint16_t RT_FAR *pu16, const uint16_t u16New, const uint16_t u16Old, uint16_t RT_FAR *pu16Old) RT_NOTHROW_DEF
1981{
1982# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
1983# if RT_INLINE_ASM_GNU_STYLE
1984 uint8_t u8Ret;
1985 __asm__ __volatile__("lock; cmpxchgw %3, %0\n\t"
1986 "setz %1\n\t"
1987 : "=m" (*pu16)
1988 , "=qm" (u8Ret)
1989 , "=a" (*pu16Old)
1990 : "r" (u16New)
1991 , "a" (u16Old)
1992 , "m" (*pu16)
1993 : "cc");
1994 return (bool)u8Ret;
1995
1996# elif RT_INLINE_ASM_USES_INTRIN
1997 return (*pu16Old = _InterlockedCompareExchange16((short RT_FAR *)pu16, u16New, u16Old)) == u16Old;
1998
1999# else
2000 uint16_t u16Ret;
2001 __asm
2002 {
2003# ifdef RT_ARCH_AMD64
2004 mov rdx, [pu16]
2005# else
2006 mov edx, [pu16]
2007# endif
2008 mov eax, [u16Old]
2009 mov ecx, [u16New]
2010# ifdef RT_ARCH_AMD64
2011 lock cmpxchg [rdx], ecx
2012 mov rdx, [pu16Old]
2013 mov [rdx], eax
2014# else
2015 lock cmpxchg [edx], ecx
2016 mov edx, [pu16Old]
2017 mov [edx], eax
2018# endif
2019 setz al
2020 movzx eax, al
2021 mov [u16Ret], eax
2022 }
2023 return !!u16Ret;
2024# endif
2025
2026# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
2027 /* M1 bench: match: casalh= 6577 vs dmb+cash= 1608 vs non-lse=5078 (ps/call)
2028 mismatch: casalh=18791 vs dmb+cash=19721 vs non-lse=2543 (ps/call) */
2029# if defined(RTASM_ARM64_USE_FEAT_LSE)
2030 union { uint32_t u; bool f; } fXchg;
2031 uint32_t u32Actual;
2032 __asm__ __volatile__("Lstart_ASMAtomicCmpXchgExU16_%=:\n\t"
2033# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
2034 "casalh %w[uOldActual], %w[uNew], %[pMem]\n\t"
2035# else
2036 RTASM_ARM_DMB_SY
2037 "cash %w[uOldActual], %w[uNew], %[pMem]\n\t"
2038# endif
2039 "cmp %w[uOldActual], %w[uOldOrg]\n\t"
2040 "cset %w[fXchg], eq\n\t"
2041 : [pMem] "+Q" (*pu16)
2042 , [uOldActual] "=&r" (u32Actual)
2043 , [fXchg] "=&r" (fXchg.u)
2044 : [uNew] "r" ((uint32_t)u16New)
2045 , [uOldOrg] "r" ((uint32_t)u16Old)
2046 , "[uOldActual]" ((uint32_t)u16Old)
2047 : "cc");
2048 *pu16Old = (uint16_t)u32Actual;
2049# else
2050 union { uint16_t u; bool f; } fXchg;
2051 uint16_t u16ActualOld;
2052 uint16_t rcSpill;
2053 __asm__ __volatile__("Ltry_again_ASMAtomicCmpXchgExU16_%=:\n\t"
2054 RTASM_ARM_DMB_SY
2055# if defined(RT_ARCH_ARM64)
2056 "ldaxrh %w[uOld], %[pMem]\n\t"
2057 "cmp %w[uOld], %w[uCmp]\n\t"
2058 "bne 1f\n\t" /* stop here if not equal */
2059 "stlxrh %w[rc], %w[uNew], %[pMem]\n\t"
2060 "cbnz %w[rc], Ltry_again_ASMAtomicCmpXchgExU16_%=\n\t"
2061 "mov %w[fXchg], #1\n\t"
2062 "1:\n\t"
2063 "clrex\n\t"
2064# else
2065 "ldrexh %[uOld], %[pMem]\n\t"
2066 "teq %[uOld], %[uCmp]\n\t"
2067 "strexheq %[rc], %[uNew], %[pMem]\n\t"
2068 "bne 1f\n\t" /* stop here if not equal */
2069 "cmp %[rc], #0\n\t"
2070 "bne Ltry_again_ASMAtomicCmpXchgExU16_%=\n\t"
2071 "mov %[fXchg], #1\n\t"
2072 "1:\n\t"
2073 /** @todo clrexne on armv7? */
2074# endif
2075 : [pMem] "+Q" (*pu16)
2076 , [uOld] "=&r" (u16ActualOld)
2077 , [rc] "=&r" (rcSpill)
2078 , [fXchg] "=&r" (fXchg.u)
2079 : [uCmp] "r" (u16Old)
2080 , [uNew] "r" (u16New)
2081 , "[fXchg]" (0)
2082 RTASM_ARM_DMB_SY_COMMA_IN_REG
2083 : "cc");
2084 *pu16Old = u16ActualOld;
2085# endif
2086 return fXchg.f;
2087
2088# else
2089# error "Port me"
2090# endif
2091}
2092#endif
2093
2094
2095/**
2096 * Atomically Compare and Exchange a signed 16-bit value, additionally
2097 * passes back old value, ordered.
2098 *
2099 * @returns true if xchg was done.
2100 * @returns false if xchg wasn't done.
2101 *
2102 * @param pi16 Pointer to the value to update.
2103 * @param i16New The new value to assigned to *pi16.
2104 * @param i16Old The old value to *pi16 compare with.
2105 * @param pi16Old Pointer store the old value at.
2106 *
2107 * @remarks x86: Requires a 486 or later.
2108 */
2109DECLINLINE(bool) ASMAtomicCmpXchgExS16(volatile int16_t RT_FAR *pi16, const int16_t i16New, const int16_t i16Old, int16_t RT_FAR *pi16Old) RT_NOTHROW_DEF
2110{
2111 return ASMAtomicCmpXchgExU16((volatile uint16_t RT_FAR *)pi16, (uint16_t)i16New, (uint16_t)i16Old, (uint16_t RT_FAR *)pi16Old);
2112}
2113
2114
2115/**
2116 * Atomically Compare and Exchange an unsigned 32-bit value, additionally
2117 * passes back old value, ordered.
2118 *
2119 * @returns true if xchg was done.
2120 * @returns false if xchg wasn't done.
2121 *
2122 * @param pu32 Pointer to the value to update.
2123 * @param u32New The new value to assigned to *pu32.
2124 * @param u32Old The old value to *pu32 compare with.
2125 * @param pu32Old Pointer store the old value at.
2126 *
2127 * @remarks x86: Requires a 486 or later.
2128 */
2129#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
2130RT_ASM_DECL_PRAGMA_WATCOM(bool) ASMAtomicCmpXchgExU32(volatile uint32_t RT_FAR *pu32, const uint32_t u32New, const uint32_t u32Old, uint32_t RT_FAR *pu32Old) RT_NOTHROW_PROTO;
2131#else
2132DECLINLINE(bool) ASMAtomicCmpXchgExU32(volatile uint32_t RT_FAR *pu32, const uint32_t u32New, const uint32_t u32Old, uint32_t RT_FAR *pu32Old) RT_NOTHROW_DEF
2133{
2134# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
2135# if RT_INLINE_ASM_GNU_STYLE
2136 uint8_t u8Ret;
2137 __asm__ __volatile__("lock; cmpxchgl %3, %0\n\t"
2138 "setz %1\n\t"
2139 : "=m" (*pu32)
2140 , "=qm" (u8Ret)
2141 , "=a" (*pu32Old)
2142 : "r" (u32New)
2143 , "a" (u32Old)
2144 , "m" (*pu32)
2145 : "cc");
2146 return (bool)u8Ret;
2147
2148# elif RT_INLINE_ASM_USES_INTRIN
2149 return (*pu32Old = _InterlockedCompareExchange((long RT_FAR *)pu32, u32New, u32Old)) == u32Old;
2150
2151# else
2152 uint32_t u32Ret;
2153 __asm
2154 {
2155# ifdef RT_ARCH_AMD64
2156 mov rdx, [pu32]
2157# else
2158 mov edx, [pu32]
2159# endif
2160 mov eax, [u32Old]
2161 mov ecx, [u32New]
2162# ifdef RT_ARCH_AMD64
2163 lock cmpxchg [rdx], ecx
2164 mov rdx, [pu32Old]
2165 mov [rdx], eax
2166# else
2167 lock cmpxchg [edx], ecx
2168 mov edx, [pu32Old]
2169 mov [edx], eax
2170# endif
2171 setz al
2172 movzx eax, al
2173 mov [u32Ret], eax
2174 }
2175 return !!u32Ret;
2176# endif
2177
2178# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
2179 union { uint32_t u; bool f; } fXchg;
2180 /* M1 bench: match: casal= 6590 vs dmb+cas= 1564 vs non-lse=5033 (ps/call)
2181 mismatch: casal=18790 vs dmb+cas=19711 vs non-lse=2503 (ps/call) */
2182# if defined(RTASM_ARM64_USE_FEAT_LSE)
2183 __asm__ __volatile__("Lstart_ASMAtomicCmpXchgExU32_%=:\n\t"
2184# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
2185 "casal %w[uOldActual], %w[uNew], %[pMem]\n\t"
2186# else
2187 RTASM_ARM_DMB_SY
2188 "cas %w[uOldActual], %w[uNew], %[pMem]\n\t"
2189# endif
2190 "cmp %w[uOldActual], %w[uOldOrg]\n\t"
2191 "cset %w[fXchg], eq\n\t"
2192 : [pMem] "+Q" (*pu32)
2193 , [uOldActual] "=&r" (*pu32Old)
2194 , [fXchg] "=&r" (fXchg.u)
2195 : [uNew] "r" (u32New)
2196 , [uOldOrg] "r" (u32Old)
2197 , "[uOldActual]" (u32Old)
2198 : "cc");
2199# else
2200 uint32_t u32ActualOld;
2201 uint32_t rcSpill;
2202 __asm__ __volatile__("Ltry_again_ASMAtomicCmpXchgExU32_%=:\n\t"
2203 RTASM_ARM_DMB_SY
2204# if defined(RT_ARCH_ARM64)
2205 "ldaxr %w[uOld], %[pMem]\n\t"
2206 "cmp %w[uOld], %w[uCmp]\n\t"
2207 "bne 1f\n\t" /* stop here if not equal */
2208 "stlxr %w[rc], %w[uNew], %[pMem]\n\t"
2209 "cbnz %w[rc], Ltry_again_ASMAtomicCmpXchgExU32_%=\n\t"
2210 "mov %w[fXchg], #1\n\t"
2211 "1:\n\t"
2212 "clrex\n\t"
2213# else
2214 "ldrex %[uOld], %[pMem]\n\t"
2215 "teq %[uOld], %[uCmp]\n\t"
2216 "strexeq %[rc], %[uNew], %[pMem]\n\t"
2217 "bne 1f\n\t" /* stop here if not equal */
2218 "cmp %[rc], #0\n\t"
2219 "bne Ltry_again_ASMAtomicCmpXchgExU32_%=\n\t"
2220 "mov %[fXchg], #1\n\t"
2221 "1:\n\t"
2222 /** @todo clrexne on armv7? */
2223# endif
2224 : [pMem] "+Q" (*pu32)
2225 , [uOld] "=&r" (u32ActualOld)
2226 , [rc] "=&r" (rcSpill)
2227 , [fXchg] "=&r" (fXchg.u)
2228 : [uCmp] "r" (u32Old)
2229 , [uNew] "r" (u32New)
2230 , "[fXchg]" (0)
2231 RTASM_ARM_DMB_SY_COMMA_IN_REG
2232 : "cc");
2233 *pu32Old = u32ActualOld;
2234# endif
2235 return fXchg.f;
2236
2237# else
2238# error "Port me"
2239# endif
2240}
2241#endif
2242
2243
2244/**
2245 * Atomically Compare and Exchange a signed 32-bit value, additionally
2246 * passes back old value, ordered.
2247 *
2248 * @returns true if xchg was done.
2249 * @returns false if xchg wasn't done.
2250 *
2251 * @param pi32 Pointer to the value to update.
2252 * @param i32New The new value to assigned to *pi32.
2253 * @param i32Old The old value to *pi32 compare with.
2254 * @param pi32Old Pointer store the old value at.
2255 *
2256 * @remarks x86: Requires a 486 or later.
2257 */
2258DECLINLINE(bool) ASMAtomicCmpXchgExS32(volatile int32_t RT_FAR *pi32, const int32_t i32New, const int32_t i32Old, int32_t RT_FAR *pi32Old) RT_NOTHROW_DEF
2259{
2260 return ASMAtomicCmpXchgExU32((volatile uint32_t RT_FAR *)pi32, (uint32_t)i32New, (uint32_t)i32Old, (uint32_t RT_FAR *)pi32Old);
2261}
2262
2263
2264/**
2265 * Atomically Compare and exchange an unsigned 64-bit value, additionally
2266 * passing back old value, ordered.
2267 *
2268 * @returns true if xchg was done.
2269 * @returns false if xchg wasn't done.
2270 *
2271 * @param pu64 Pointer to the 64-bit variable to update.
2272 * @param u64New The 64-bit value to assign to *pu64.
2273 * @param u64Old The value to compare with.
2274 * @param pu64Old Pointer store the old value at.
2275 *
2276 * @remarks x86: Requires a Pentium or later.
2277 */
2278#if (RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN) \
2279 || RT_INLINE_DONT_MIX_CMPXCHG8B_AND_PIC
2280RT_ASM_DECL_PRAGMA_WATCOM(bool) ASMAtomicCmpXchgExU64(volatile uint64_t RT_FAR *pu64, const uint64_t u64New, const uint64_t u64Old, uint64_t RT_FAR *pu64Old) RT_NOTHROW_PROTO;
2281#else
2282DECLINLINE(bool) ASMAtomicCmpXchgExU64(volatile uint64_t RT_FAR *pu64, const uint64_t u64New, const uint64_t u64Old, uint64_t RT_FAR *pu64Old) RT_NOTHROW_DEF
2283{
2284# if RT_INLINE_ASM_USES_INTRIN
2285 return (*pu64Old =_InterlockedCompareExchange64((__int64 RT_FAR *)pu64, u64New, u64Old)) == u64Old;
2286
2287# elif defined(RT_ARCH_AMD64)
2288# if RT_INLINE_ASM_GNU_STYLE
2289 uint8_t u8Ret;
2290 __asm__ __volatile__("lock; cmpxchgq %3, %0\n\t"
2291 "setz %1\n\t"
2292 : "=m" (*pu64)
2293 , "=qm" (u8Ret)
2294 , "=a" (*pu64Old)
2295 : "r" (u64New)
2296 , "a" (u64Old)
2297 , "m" (*pu64)
2298 : "cc");
2299 return (bool)u8Ret;
2300# else
2301 bool fRet;
2302 __asm
2303 {
2304 mov rdx, [pu32]
2305 mov rax, [u64Old]
2306 mov rcx, [u64New]
2307 lock cmpxchg [rdx], rcx
2308 mov rdx, [pu64Old]
2309 mov [rdx], rax
2310 setz al
2311 mov [fRet], al
2312 }
2313 return fRet;
2314# endif
2315
2316# elif defined(RT_ARCH_X86)
2317# if RT_INLINE_ASM_GNU_STYLE
2318 uint64_t u64Ret;
2319# if defined(PIC) || defined(__PIC__)
2320 /* Note #1: This code uses a memory clobber description, because the clean
2321 solution with an output value for *pu64 makes gcc run out of
2322 registers. This will cause suboptimal code, and anyone with a
2323 better solution is welcome to improve this.
2324
2325 Note #2: We must prevent gcc from encoding the memory access, as it
2326 may go via the GOT if we're working on a global variable (like
2327 in the testcase). Thus we request a register (%3) and
2328 dereference it ourselves. */
2329 __asm__ __volatile__("xchgl %%ebx, %1\n\t"
2330 "lock; cmpxchg8b (%3)\n\t"
2331 "xchgl %%ebx, %1\n\t"
2332 : "=A" (u64Ret)
2333 : "DS" ((uint32_t)u64New)
2334 , "c" ((uint32_t)(u64New >> 32))
2335 , "r" (pu64) /* Do not use "m" here*/
2336 , "0" (u64Old)
2337 : "memory"
2338 , "cc" );
2339# else /* !PIC */
2340 __asm__ __volatile__("lock; cmpxchg8b %4\n\t"
2341 : "=A" (u64Ret)
2342 , "=m" (*pu64)
2343 : "b" ((uint32_t)u64New)
2344 , "c" ((uint32_t)(u64New >> 32))
2345 , "m" (*pu64)
2346 , "0" (u64Old)
2347 : "cc");
2348# endif
2349 *pu64Old = u64Ret;
2350 return u64Ret == u64Old;
2351# else
2352 uint32_t u32Ret;
2353 __asm
2354 {
2355 mov ebx, dword ptr [u64New]
2356 mov ecx, dword ptr [u64New + 4]
2357 mov edi, [pu64]
2358 mov eax, dword ptr [u64Old]
2359 mov edx, dword ptr [u64Old + 4]
2360 lock cmpxchg8b [edi]
2361 mov ebx, [pu64Old]
2362 mov [ebx], eax
2363 setz al
2364 movzx eax, al
2365 add ebx, 4
2366 mov [ebx], edx
2367 mov dword ptr [u32Ret], eax
2368 }
2369 return !!u32Ret;
2370# endif
2371
2372# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
2373 union { uint32_t u; bool f; } fXchg;
2374 /* M1 bench: match: casal= 6606 vs dmb+cas= 1565 vs non-lse=5006 (ps/call)
2375 mismatch: casal=18786 vs dmb+cas=19718 vs non-lse=2503 (ps/call) */
2376# if defined(RTASM_ARM64_USE_FEAT_LSE)
2377 __asm__ __volatile__("Lstart_ASMAtomicCmpXchgExU32_%=:\n\t"
2378# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
2379 "casal %[uOldActual], %[uNew], %[pMem]\n\t"
2380# else
2381 RTASM_ARM_DMB_SY
2382 "cas %[uOldActual], %[uNew], %[pMem]\n\t"
2383# endif
2384 "cmp %[uOldActual], %[uOldOrg]\n\t"
2385 "cset %w[fXchg], eq\n\t"
2386 : [pMem] "+Q" (*pu64)
2387 , [uOldActual] "=&r" (*pu64Old)
2388 , [fXchg] "=&r" (fXchg.u)
2389 : [uNew] "r" (u64New)
2390 , [uOldOrg] "r" (u64Old)
2391 , "[uOldActual]" (u64Old)
2392 : "cc");
2393# else
2394 uint64_t u64ActualOld;
2395 uint32_t rcSpill;
2396 __asm__ __volatile__("Ltry_again_ASMAtomicCmpXchgU64_%=:\n\t"
2397 RTASM_ARM_DMB_SY
2398# if defined(RT_ARCH_ARM64)
2399 "ldaxr %[uOld], %[pMem]\n\t"
2400 "cmp %[uOld], %[uCmp]\n\t"
2401 "bne 1f\n\t" /* stop here if not equal */
2402 "stlxr %w[rc], %[uNew], %[pMem]\n\t"
2403 "cbnz %w[rc], Ltry_again_ASMAtomicCmpXchgU64_%=\n\t"
2404 "mov %w[fXchg], #1\n\t"
2405 "1:\n\t"
2406 "clrex\n\t"
2407# else
2408 "ldrexd %[uOld], %H[uOld], %[pMem]\n\t"
2409 "teq %[uOld], %[uCmp]\n\t"
2410 "teqeq %H[uOld], %H[uCmp]\n\t"
2411 "strexdeq %[rc], %[uNew], %H[uNew], %[pMem]\n\t"
2412 "bne 1f\n\t" /* stop here if not equal */
2413 "cmp %[rc], #0\n\t"
2414 "bne Ltry_again_ASMAtomicCmpXchgU64_%=\n\t"
2415 "mov %[fXchg], #1\n\t"
2416 "1:\n\t"
2417 /** @todo clrexne on armv7? */
2418# endif
2419 : [pMem] "+Q" (*pu64)
2420 , [uOld] "=&r" (u64ActualOld)
2421 , [rc] "=&r" (rcSpill)
2422 , [fXchg] "=&r" (fXchg.u)
2423 : [uCmp] "r" (u64Old)
2424 , [uNew] "r" (u64New)
2425 , "[fXchg]" (0)
2426 RTASM_ARM_DMB_SY_COMMA_IN_REG
2427 : "cc");
2428 *pu64Old = u64ActualOld;
2429# endif
2430 return fXchg.f;
2431
2432# else
2433# error "Port me"
2434# endif
2435}
2436#endif
2437
2438
2439/**
2440 * Atomically Compare and exchange a signed 64-bit value, additionally
2441 * passing back old value, ordered.
2442 *
2443 * @returns true if xchg was done.
2444 * @returns false if xchg wasn't done.
2445 *
2446 * @param pi64 Pointer to the 64-bit variable to update.
2447 * @param i64 The 64-bit value to assign to *pu64.
2448 * @param i64Old The value to compare with.
2449 * @param pi64Old Pointer store the old value at.
2450 *
2451 * @remarks x86: Requires a Pentium or later.
2452 */
2453DECLINLINE(bool) ASMAtomicCmpXchgExS64(volatile int64_t RT_FAR *pi64, const int64_t i64, const int64_t i64Old, int64_t RT_FAR *pi64Old) RT_NOTHROW_DEF
2454{
2455 return ASMAtomicCmpXchgExU64((volatile uint64_t RT_FAR *)pi64, (uint64_t)i64, (uint64_t)i64Old, (uint64_t RT_FAR *)pi64Old);
2456}
2457
2458#if defined(RT_ARCH_AMD64) || defined(RT_ARCH_ARM64) || defined(DOXYGEN_RUNNING)
2459
/** @def RTASM_HAVE_CMP_XCHG_U128
 * Indicates that we've got ASMAtomicCmpXchgU128v2(), ASMAtomicCmpXchgU128()
 * and ASMAtomicCmpXchgU128U() available. */
2463# define RTASM_HAVE_CMP_XCHG_U128 1
2464
2465
2466/**
2467 * Atomically compare and exchange an unsigned 128-bit value, ordered.
2468 *
2469 * @returns true if exchange was done.
2470 * @returns false if exchange wasn't done.
2471 *
2472 * @param pu128 Pointer to the 128-bit variable to update.
2473 * @param u64NewHi The high 64 bits of the value to assign to *pu128.
2474 * @param u64NewLo The low 64 bits of the value to assign to *pu128.
2475 * @param u64OldHi The high 64-bit of the value to compare with.
2476 * @param u64OldLo The low 64-bit of the value to compare with.
2477 * @param pu128Old Where to return the old value.
2478 *
2479 * @remarks AMD64: Not present in the earliest CPUs, so check CPUID.
2480 */
2481# if (RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN)
2482DECLASM(bool) ASMAtomicCmpXchgU128v2(volatile uint128_t *pu128, const uint64_t u64NewHi, const uint64_t u64NewLo,
2483 const uint64_t u64OldHi, const uint64_t u64OldLo, uint128_t *pu128Old) RT_NOTHROW_PROTO;
2484# else
2485DECLINLINE(bool) ASMAtomicCmpXchgU128v2(volatile uint128_t *pu128, const uint64_t u64NewHi, const uint64_t u64NewLo,
2486 const uint64_t u64OldHi, const uint64_t u64OldLo, uint128_t *pu128Old) RT_NOTHROW_DEF
2487{
2488# if RT_INLINE_ASM_USES_INTRIN
2489 pu128Old->Hi = u64OldHi;
2490 pu128Old->Lo = u64OldLo;
2491 AssertCompileMemberOffset(uint128_t, Lo, 0);
2492 return _InterlockedCompareExchange128((__int64 volatile *)pu128, u64NewHi, u64NewLo, (__int64 *)&pu128Old->Lo) != 0;
2493
2494# elif (defined(__clang_major__) || defined(__GNUC__)) && defined(RT_ARCH_ARM64)
2495 uint128_t const uCmp = ((uint128_t)u64OldHi << 64) | u64OldLo;
2496 uint128_t const uOld = __sync_val_compare_and_swap(pu128, uCmp, ((uint128_t)u64NewHi << 64) | u64NewLo);
2497 *pu128Old = uOld;
2498 return uCmp == uOld;
2499
2500# elif defined(RT_ARCH_AMD64)
2501# if RT_INLINE_ASM_GNU_STYLE
2502 uint8_t bRet;
2503 uint64_t u64RetHi, u64RetLo;
2504 __asm__ __volatile__("lock; cmpxchg16b %3\n\t"
2505 "setz %b0\n\t"
2506 : "=r" (bRet)
2507 , "=a" (u64RetLo)
2508 , "=d" (u64RetHi)
2509 , "+m" (*pu128)
2510 : "a" (u64OldLo)
2511 , "d" (u64OldHi)
2512 , "b" (u64NewLo)
2513 , "c" (u64NewHi)
2514 : "cc");
2515 *pu128Old = ((uint128_t)u64RetHi << 64) | u64RetLo;
2516 return (bool)bRet;
2517# else
2518# error "Port me"
2519# endif
2520# else
2521# error "Port me"
2522# endif
2523}
2524# endif
2525
2526
2527/**
2528 * Atomically compare and exchange an unsigned 128-bit value, ordered.
2529 *
2530 * @returns true if exchange was done.
2531 * @returns false if exchange wasn't done.
2532 *
2533 * @param pu128 Pointer to the 128-bit variable to update.
2534 * @param u128New The 128-bit value to assign to *pu128.
2535 * @param u128Old The value to compare with.
2536 * @param pu128Old Where to return the old value.
2537 *
2538 * @remarks AMD64: Not present in the earliest CPUs, so check CPUID.
2539 */
2540DECLINLINE(bool) ASMAtomicCmpXchgU128(volatile uint128_t *pu128, const uint128_t u128New,
2541 const uint128_t u128Old, uint128_t *pu128Old) RT_NOTHROW_DEF
2542{
2543# ifdef RT_COMPILER_WITH_128BIT_INT_TYPES
2544# if (defined(__clang_major__) || defined(__GNUC__)) && defined(RT_ARCH_ARM64)
2545 uint128_t const uSwapped = __sync_val_compare_and_swap(pu128, u128Old, u128New);
2546 *pu128Old = uSwapped;
2547 return uSwapped == u128Old;
2548# else
2549 return ASMAtomicCmpXchgU128v2(pu128, (uint64_t)(u128New >> 64), (uint64_t)u128New,
2550 (uint64_t)(u128Old >> 64), (uint64_t)u128Old, pu128Old);
2551# endif
2552# else
2553 return ASMAtomicCmpXchgU128v2(pu128, u128New.Hi, u128New.Lo, u128Old.Hi, u128Old.Lo, pu128Old);
2554# endif
2555}
2556
2557
2558/**
2559 * RTUINT128U wrapper for ASMAtomicCmpXchgU128.
2560 */
2561DECLINLINE(bool) ASMAtomicCmpXchgU128U(volatile RTUINT128U *pu128, const RTUINT128U u128New,
2562 const RTUINT128U u128Old, PRTUINT128U pu128Old) RT_NOTHROW_DEF
2563{
2564# if (defined(__clang_major__) || defined(__GNUC__)) && defined(RT_ARCH_ARM64)
2565 return ASMAtomicCmpXchgU128(&pu128->u, u128New.u, u128Old.u, &pu128Old->u);
2566# else
2567 return ASMAtomicCmpXchgU128v2(&pu128->u, u128New.s.Hi, u128New.s.Lo, u128Old.s.Hi, u128Old.s.Lo, &pu128Old->u);
2568# endif
2569}
2570
2571#endif /* RT_ARCH_AMD64 || RT_ARCH_ARM64 */
2572
2573
2574
2575/** @def ASMAtomicCmpXchgExHandle
2576 * Atomically Compare and Exchange a typical IPRT handle value, ordered.
2577 *
2578 * @param ph Pointer to the value to update.
2579 * @param hNew The new value to assigned to *pu.
2580 * @param hOld The old value to *pu compare with.
2581 * @param fRc Where to store the result.
2582 * @param phOldVal Pointer to where to store the old value.
2583 *
2584 * @remarks This doesn't currently work for all handles (like RTFILE).
2585 */
2586#if HC_ARCH_BITS == 32 || ARCH_BITS == 16
2587# define ASMAtomicCmpXchgExHandle(ph, hNew, hOld, fRc, phOldVal) \
2588 do { \
2589 AssertCompile(sizeof(*ph) == sizeof(uint32_t)); \
2590 AssertCompile(sizeof(*phOldVal) == sizeof(uint32_t)); \
2591 (fRc) = ASMAtomicCmpXchgExU32((volatile uint32_t RT_FAR *)(ph), (uint32_t)(hNew), (uint32_t)(hOld), (uint32_t RT_FAR *)(phOldVal)); \
2592 } while (0)
2593#elif HC_ARCH_BITS == 64
2594# define ASMAtomicCmpXchgExHandle(ph, hNew, hOld, fRc, phOldVal) \
2595 do { \
2596 AssertCompile(sizeof(*(ph)) == sizeof(uint64_t)); \
2597 AssertCompile(sizeof(*(phOldVal)) == sizeof(uint64_t)); \
2598 (fRc) = ASMAtomicCmpXchgExU64((volatile uint64_t RT_FAR *)(ph), (uint64_t)(hNew), (uint64_t)(hOld), (uint64_t RT_FAR *)(phOldVal)); \
2599 } while (0)
2600#else
2601# error HC_ARCH_BITS
2602#endif
2603
2604
/** @def ASMAtomicCmpXchgExSize
 * Atomically Compare and Exchange a value which size might differ
 * between platforms or compilers. Additionally passes back old value.
 *
 * Only 4 and 8 byte values are handled; any other size triggers an assertion,
 * sets @a fRc to false and zeros *puOldVal.
 *
 * @param   pu          Pointer to the value to update.
 * @param   uNew        The new value to assign to *pu.
 * @param   uOld        The value *pu is compared with.
 * @param   fRc         Where to store the result.
 * @param   puOldVal    Pointer to where to store the old value.
 *
 * @remarks x86: Requires a 486 or later.
 */
#define ASMAtomicCmpXchgExSize(pu, uNew, uOld, fRc, puOldVal) \
    do { \
        switch (sizeof(*(pu))) { \
            case 4: (fRc) = ASMAtomicCmpXchgExU32((volatile uint32_t RT_FAR *)(void RT_FAR *)(pu), (uint32_t)(uNew), (uint32_t)(uOld), (uint32_t RT_FAR *)(puOldVal)); \
                break; \
            case 8: (fRc) = ASMAtomicCmpXchgExU64((volatile uint64_t RT_FAR *)(void RT_FAR *)(pu), (uint64_t)(uNew), (uint64_t)(uOld), (uint64_t RT_FAR *)(puOldVal)); \
                break; \
            default: AssertMsgFailed(("ASMAtomicCmpXchgExSize: size %d is not supported\n", sizeof(*(pu)))); \
                (fRc) = false; \
                *(puOldVal) = 0; \
                break; \
        } \
    } while (0)
2630
2631
2632/**
2633 * Atomically Compare and Exchange a pointer value, additionally
2634 * passing back old value, ordered.
2635 *
2636 * @returns true if xchg was done.
2637 * @returns false if xchg wasn't done.
2638 *
2639 * @param ppv Pointer to the value to update.
2640 * @param pvNew The new value to assigned to *ppv.
2641 * @param pvOld The old value to *ppv compare with.
2642 * @param ppvOld Pointer store the old value at.
2643 *
2644 * @remarks x86: Requires a 486 or later.
2645 */
2646DECLINLINE(bool) ASMAtomicCmpXchgExPtrVoid(void RT_FAR * volatile RT_FAR *ppv, const void RT_FAR *pvNew, const void RT_FAR *pvOld,
2647 void RT_FAR * RT_FAR *ppvOld) RT_NOTHROW_DEF
2648{
2649#if ARCH_BITS == 32 || ARCH_BITS == 16
2650 return ASMAtomicCmpXchgExU32((volatile uint32_t RT_FAR *)(void RT_FAR *)ppv, (uint32_t)pvNew, (uint32_t)pvOld, (uint32_t RT_FAR *)ppvOld);
2651#elif ARCH_BITS == 64
2652 return ASMAtomicCmpXchgExU64((volatile uint64_t RT_FAR *)(void RT_FAR *)ppv, (uint64_t)pvNew, (uint64_t)pvOld, (uint64_t RT_FAR *)ppvOld);
2653#else
2654# error "ARCH_BITS is bogus"
2655#endif
2656}
2657
2658
/**
 * Atomically Compare and Exchange a pointer value, additionally
 * passing back old value, ordered.
 *
 * Macro front end for ASMAtomicCmpXchgExPtrVoid().  In the __GNUC__ variant
 * the arguments are first assigned to locals declared with __typeof__ so that
 * they are checked against the type of *ppv.
 *
 * @returns true if xchg was done.
 * @returns false if xchg wasn't done.
 *
 * @param   ppv         Pointer to the value to update.
 * @param   pvNew       The new value to assign to *ppv.
 * @param   pvOld       The value *ppv is compared with.
 * @param   ppvOld      Where to store the old value.
 *
 * @remarks This is relatively type safe on GCC platforms.
 * @remarks x86: Requires a 486 or later.
 */
#ifdef __GNUC__
# define ASMAtomicCmpXchgExPtr(ppv, pvNew, pvOld, ppvOld) \
    __extension__ \
    ({ \
        __typeof__(*(ppv)) volatile * const ppvTypeChecked    = (ppv); \
        __typeof__(*(ppv)) const            pvNewTypeChecked  = (pvNew); \
        __typeof__(*(ppv)) const            pvOldTypeChecked  = (pvOld); \
        __typeof__(*(ppv)) * const          ppvOldTypeChecked = (ppvOld); \
        ASMAtomicCmpXchgExPtrVoid((void * volatile *)ppvTypeChecked, \
                                  (void *)pvNewTypeChecked, \
                                  (void *)pvOldTypeChecked, \
                                  (void **)ppvOldTypeChecked); \
    })
#else
# define ASMAtomicCmpXchgExPtr(ppv, pvNew, pvOld, ppvOld) \
    ASMAtomicCmpXchgExPtrVoid((void RT_FAR * volatile RT_FAR *)(ppv), (void RT_FAR *)(pvNew), (void RT_FAR *)(pvOld), (void RT_FAR * RT_FAR *)(ppvOld))
#endif
2691
2692
2693/**
2694 * Virtualization unfriendly serializing instruction, always exits.
2695 */
2696#if (RT_INLINE_ASM_EXTERNAL && !RT_INLINE_ASM_USES_INTRIN) || (!defined(RT_ARCH_AMD64) && !defined(RT_ARCH_X86))
2697RT_ASM_DECL_PRAGMA_WATCOM(void) ASMSerializeInstructionCpuId(void) RT_NOTHROW_PROTO;
2698#else
2699DECLINLINE(void) ASMSerializeInstructionCpuId(void) RT_NOTHROW_DEF
2700{
2701# if RT_INLINE_ASM_GNU_STYLE
2702 RTCCUINTREG xAX = 0;
2703# ifdef RT_ARCH_AMD64
2704 __asm__ __volatile__ ("cpuid"
2705 : "=a" (xAX)
2706 : "0" (xAX)
2707 : "rbx", "rcx", "rdx", "memory");
2708# elif (defined(PIC) || defined(__PIC__)) && defined(__i386__)
2709 __asm__ __volatile__ ("push %%ebx\n\t"
2710 "cpuid\n\t"
2711 "pop %%ebx\n\t"
2712 : "=a" (xAX)
2713 : "0" (xAX)
2714 : "ecx", "edx", "memory");
2715# else
2716 __asm__ __volatile__ ("cpuid"
2717 : "=a" (xAX)
2718 : "0" (xAX)
2719 : "ebx", "ecx", "edx", "memory");
2720# endif
2721
2722# elif RT_INLINE_ASM_USES_INTRIN
2723 int aInfo[4];
2724 _ReadWriteBarrier();
2725 __cpuid(aInfo, 0);
2726
2727# else
2728 __asm
2729 {
2730 push ebx
2731 xor eax, eax
2732 cpuid
2733 pop ebx
2734 }
2735# endif
2736}
2737#endif
2738
2739/**
2740 * Virtualization friendly serializing instruction, though more expensive.
2741 */
2742#if RT_INLINE_ASM_EXTERNAL || (!defined(RT_ARCH_AMD64) && !defined(RT_ARCH_X86))
2743RT_ASM_DECL_PRAGMA_WATCOM(void) ASMSerializeInstructionIRet(void) RT_NOTHROW_PROTO;
2744#else
2745DECLINLINE(void) ASMSerializeInstructionIRet(void) RT_NOTHROW_DEF
2746{
2747# if RT_INLINE_ASM_GNU_STYLE
2748# ifdef RT_ARCH_AMD64
2749 __asm__ __volatile__ ("movq %%rsp,%%r10\n\t"
2750 "subq $128, %%rsp\n\t" /*redzone*/
2751 "mov %%ss, %%eax\n\t"
2752 "pushq %%rax\n\t"
2753 "pushq %%r10\n\t"
2754 "pushfq\n\t"
2755 "movl %%cs, %%eax\n\t"
2756 "pushq %%rax\n\t"
2757 "leaq 1f(%%rip), %%rax\n\t"
2758 "pushq %%rax\n\t"
2759 "iretq\n\t"
2760 "1:\n\t"
2761 ::: "rax", "r10", "memory", "cc");
2762# else
2763 __asm__ __volatile__ ("pushfl\n\t"
2764 "pushl %%cs\n\t"
2765 "pushl $1f\n\t"
2766 "iretl\n\t"
2767 "1:\n\t"
2768 ::: "memory");
2769# endif
2770
2771# else
2772 __asm
2773 {
2774 pushfd
2775 push cs
2776 push la_ret
2777 iretd
2778 la_ret:
2779 }
2780# endif
2781}
2782#endif
2783
2784/**
2785 * Virtualization friendlier serializing instruction, may still cause exits.
2786 */
2787#if (RT_INLINE_ASM_EXTERNAL && RT_INLINE_ASM_USES_INTRIN < RT_MSC_VER_VS2008) || (!defined(RT_ARCH_AMD64) && !defined(RT_ARCH_X86))
2788RT_ASM_DECL_PRAGMA_WATCOM(void) ASMSerializeInstructionRdTscp(void) RT_NOTHROW_PROTO;
2789#else
2790DECLINLINE(void) ASMSerializeInstructionRdTscp(void) RT_NOTHROW_DEF
2791{
2792# if RT_INLINE_ASM_GNU_STYLE
2793 /* rdtscp is not supported by ancient linux build VM of course :-( */
2794# ifdef RT_ARCH_AMD64
2795 /*__asm__ __volatile__("rdtscp\n\t" ::: "rax", "rdx, "rcx"); */
2796 __asm__ __volatile__(".byte 0x0f,0x01,0xf9\n\t" ::: "rax", "rdx", "rcx", "memory");
2797# else
2798 /*__asm__ __volatile__("rdtscp\n\t" ::: "eax", "edx, "ecx"); */
2799 __asm__ __volatile__(".byte 0x0f,0x01,0xf9\n\t" ::: "eax", "edx", "ecx", "memory");
2800# endif
2801# else
2802# if RT_INLINE_ASM_USES_INTRIN >= RT_MSC_VER_VS2008
2803 uint32_t uIgnore;
2804 _ReadWriteBarrier();
2805 (void)__rdtscp(&uIgnore);
2806 (void)uIgnore;
2807# else
2808 __asm
2809 {
2810 rdtscp
2811 }
2812# endif
2813# endif
2814}
2815#endif
2816
2817
2818/**
2819 * Serialize Instruction (both data store and instruction flush).
2820 */
2821#if (defined(RT_ARCH_X86) && ARCH_BITS == 16) || defined(IN_GUEST)
2822# define ASMSerializeInstruction() ASMSerializeInstructionIRet()
2823#elif defined(RT_ARCH_X86) || defined(RT_ARCH_AMD64)
2824# define ASMSerializeInstruction() ASMSerializeInstructionCpuId()
2825#elif defined(RT_ARCH_SPARC64)
2826RTDECL(void) ASMSerializeInstruction(void) RT_NOTHROW_PROTO;
2827#elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
2828DECLINLINE(void) ASMSerializeInstruction(void) RT_NOTHROW_DEF
2829{
2830 __asm__ __volatile__ (RTASM_ARM_DSB_SY :: RTASM_ARM_DSB_SY_IN_REG :);
2831}
2832#else
2833# error "Port me"
2834#endif
2835
2836
2837/**
2838 * Memory fence, waits for any pending writes and reads to complete.
2839 * @note No implicit compiler barrier (which is probably stupid).
2840 */
2841DECLINLINE(void) ASMMemoryFence(void) RT_NOTHROW_DEF
2842{
2843#if defined(RT_ARCH_AMD64) || (defined(RT_ARCH_X86) && !defined(RT_WITH_OLD_CPU_SUPPORT))
2844# if RT_INLINE_ASM_GNU_STYLE
2845 __asm__ __volatile__ (".byte 0x0f,0xae,0xf0\n\t");
2846# elif RT_INLINE_ASM_USES_INTRIN
2847 _mm_mfence();
2848# else
2849 __asm
2850 {
2851 _emit 0x0f
2852 _emit 0xae
2853 _emit 0xf0
2854 }
2855# endif
2856#elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
2857 __asm__ __volatile__ (RTASM_ARM_DMB_SY :: RTASM_ARM_DMB_SY_IN_REG :);
2858#elif ARCH_BITS == 16
2859 uint16_t volatile u16;
2860 ASMAtomicXchgU16(&u16, 0);
2861#else
2862 uint32_t volatile u32;
2863 ASMAtomicXchgU32(&u32, 0);
2864#endif
2865}
2866
2867
2868/**
2869 * Write fence, waits for any pending writes to complete.
2870 * @note No implicit compiler barrier (which is probably stupid).
2871 */
2872DECLINLINE(void) ASMWriteFence(void) RT_NOTHROW_DEF
2873{
2874#if defined(RT_ARCH_AMD64) || (defined(RT_ARCH_X86) && !defined(RT_WITH_OLD_CPU_SUPPORT))
2875# if RT_INLINE_ASM_GNU_STYLE
2876 __asm__ __volatile__ (".byte 0x0f,0xae,0xf8\n\t");
2877# elif RT_INLINE_ASM_USES_INTRIN
2878 _mm_sfence();
2879# else
2880 __asm
2881 {
2882 _emit 0x0f
2883 _emit 0xae
2884 _emit 0xf8
2885 }
2886# endif
2887#elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
2888 __asm__ __volatile__ (RTASM_ARM_DMB_ST :: RTASM_ARM_DMB_ST_IN_REG :);
2889#else
2890 ASMMemoryFence();
2891#endif
2892}
2893
2894
2895/**
2896 * Read fence, waits for any pending reads to complete.
2897 * @note No implicit compiler barrier (which is probably stupid).
2898 */
2899DECLINLINE(void) ASMReadFence(void) RT_NOTHROW_DEF
2900{
2901#if defined(RT_ARCH_AMD64) || (defined(RT_ARCH_X86) && !defined(RT_WITH_OLD_CPU_SUPPORT))
2902# if RT_INLINE_ASM_GNU_STYLE
2903 __asm__ __volatile__ (".byte 0x0f,0xae,0xe8\n\t");
2904# elif RT_INLINE_ASM_USES_INTRIN
2905 _mm_lfence();
2906# else
2907 __asm
2908 {
2909 _emit 0x0f
2910 _emit 0xae
2911 _emit 0xe8
2912 }
2913# endif
2914#elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
2915 __asm__ __volatile__ (RTASM_ARM_DMB_LD :: RTASM_ARM_DMB_LD_IN_REG :);
2916#else
2917 ASMMemoryFence();
2918#endif
2919}
2920
2921
2922/**
2923 * Atomically reads an unsigned 8-bit value, ordered.
2924 *
2925 * @returns Current *pu8 value
2926 * @param pu8 Pointer to the 8-bit variable to read.
2927 */
2928DECLINLINE(uint8_t) ASMAtomicReadU8(volatile uint8_t RT_FAR *pu8) RT_NOTHROW_DEF
2929{
2930#if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
2931 uint32_t u32;
2932# if defined(RTASM_ARM64_USE_FEAT_LSE) && 0 /* very expensive on M1 */
2933 __asm__ __volatile__("Lstart_ASMAtomicReadU8_%=:\n\t"
2934 RTASM_ARM_DMB_SY
2935 "casab %w[uDst], wzr, %[pMem]\n\t"
2936 : [uDst] "=&r" (u32)
2937 : [pMem] "Q" (*pu8),
2938 "0" (0)
2939 RTASM_ARM_DMB_SY_COMMA_IN_REG);
2940# else
2941 __asm__ __volatile__("Lstart_ASMAtomicReadU8_%=:\n\t"
2942 RTASM_ARM_DMB_SY
2943# if defined(RT_ARCH_ARM64)
2944# if 1 /* shouldn't be any need for more than single-copy atomicity when we've got a proper barrier, just like on x86. */
2945 "ldurb %w[uDst], %[pMem]\n\t"
2946# else
2947 "ldxrb %w[uDst], %[pMem]\n\t"
2948 "clrex\n\t"
2949# endif
2950# else
2951 "ldrexb %[uDst], %[pMem]\n\t"
2952 /** @todo clrex */
2953# endif
2954 : [uDst] "=&r" (u32)
2955 : [pMem] "Q" (*pu8)
2956 RTASM_ARM_DMB_SY_COMMA_IN_REG);
2957# endif
2958 return (uint8_t)u32;
2959#else
2960 ASMMemoryFence();
2961 return *pu8; /* byte reads are atomic on x86 */
2962#endif
2963}
2964
2965
2966/**
2967 * Atomically reads an unsigned 8-bit value, unordered.
2968 *
2969 * @returns Current *pu8 value
2970 * @param pu8 Pointer to the 8-bit variable to read.
2971 */
2972DECLINLINE(uint8_t) ASMAtomicUoReadU8(volatile uint8_t RT_FAR *pu8) RT_NOTHROW_DEF
2973{
2974#if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
2975 uint32_t u32;
2976 __asm__ __volatile__("Lstart_ASMAtomicUoReadU8_%=:\n\t"
2977# if defined(RT_ARCH_ARM64)
2978 "ldurb %w[uDst], %[pMem]\n\t"
2979# else
2980 "ldrexb %[uDst], %[pMem]\n\t" /** @todo fix this */
2981# endif
2982 : [uDst] "=&r" (u32)
2983 : [pMem] "Q" (*pu8));
2984 return (uint8_t)u32;
2985#else
2986 return *pu8; /* byte reads are atomic on x86 */
2987#endif
2988}
2989
2990
2991/**
2992 * Atomically reads a signed 8-bit value, ordered.
2993 *
2994 * @returns Current *pi8 value
2995 * @param pi8 Pointer to the 8-bit variable to read.
2996 */
2997DECLINLINE(int8_t) ASMAtomicReadS8(volatile int8_t RT_FAR *pi8) RT_NOTHROW_DEF
2998{
2999#if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
3000 return (int8_t)ASMAtomicReadU8((volatile uint8_t RT_FAR *)pi8);
3001#else
3002 ASMMemoryFence();
3003 return *pi8; /* byte reads are atomic on x86 */
3004#endif
3005}
3006
3007
3008/**
3009 * Atomically reads a signed 8-bit value, unordered.
3010 *
3011 * @returns Current *pi8 value
3012 * @param pi8 Pointer to the 8-bit variable to read.
3013 */
3014DECLINLINE(int8_t) ASMAtomicUoReadS8(volatile int8_t RT_FAR *pi8) RT_NOTHROW_DEF
3015{
3016#if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
3017 int32_t i32;
3018 __asm__ __volatile__("Lstart_ASMAtomicUoReadS8_%=:\n\t"
3019# if defined(RT_ARCH_ARM64)
3020 "ldurb %w[iDst], %[pMem]\n\t"
3021# else
3022 "ldrexb %[iDst], %[pMem]\n\t" /** @todo fix this */
3023# endif
3024 : [iDst] "=&r" (i32)
3025 : [pMem] "Q" (*pi8));
3026 return (int8_t)i32;
3027#else
3028 return *pi8; /* byte reads are atomic on x86 */
3029#endif
3030}
3031
3032
3033/**
3034 * Atomically reads an unsigned 16-bit value, ordered.
3035 *
3036 * @returns Current *pu16 value
3037 * @param pu16 Pointer to the 16-bit variable to read.
3038 */
3039DECLINLINE(uint16_t) ASMAtomicReadU16(volatile uint16_t RT_FAR *pu16) RT_NOTHROW_DEF
3040{
3041 Assert(!((uintptr_t)pu16 & 1));
3042#if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
3043 uint32_t u32;
3044# if defined(RTASM_ARM64_USE_FEAT_LSE) && 0 /* very expensive on M1, but alignment advantages with LEA2 (M2?). */
3045 __asm__ __volatile__("Lstart_ASMAtomicReadU16_%=:\n\t"
3046 RTASM_ARM_DMB_SY
3047 "casah %w[uDst], wzr, %[pMem]\n\t"
3048 : [uDst] "=&r" (u32)
3049 : [pMem] "Q" (*pu16),
3050 "0" (0)
3051 RTASM_ARM_DMB_SY_COMMA_IN_REG);
3052# else
3053 __asm__ __volatile__("Lstart_ASMAtomicReadU16_%=:\n\t"
3054 RTASM_ARM_DMB_SY
3055# if defined(RT_ARCH_ARM64)
3056# if 1 /* ASSUMING proper barrier and aligned access, we should be fine with single-copy atomicity, just like on x86. */
3057 "ldurh %w[uDst], %[pMem]\n\t"
3058# else
3059 "ldxrh %w[uDst], %[pMem]\n\t"
3060 "clrex\n\t"
3061# endif
3062# else
3063 "ldrexh %[uDst], %[pMem]\n\t"
3064 /** @todo clrex */
3065# endif
3066 : [uDst] "=&r" (u32)
3067 : [pMem] "Q" (*pu16)
3068 RTASM_ARM_DMB_SY_COMMA_IN_REG);
3069# endif
3070 return (uint16_t)u32;
3071#else
3072 ASMMemoryFence();
3073 return *pu16;
3074#endif
3075}
3076
3077
3078/**
3079 * Atomically reads an unsigned 16-bit value, unordered.
3080 *
3081 * @returns Current *pu16 value
3082 * @param pu16 Pointer to the 16-bit variable to read.
3083 */
3084DECLINLINE(uint16_t) ASMAtomicUoReadU16(volatile uint16_t RT_FAR *pu16) RT_NOTHROW_DEF
3085{
3086 Assert(!((uintptr_t)pu16 & 1));
3087#if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
3088 uint32_t u32;
3089 __asm__ __volatile__("Lstart_ASMAtomicUoReadU16_%=:\n\t"
3090# if defined(RT_ARCH_ARM64)
3091 "ldurh %w[uDst], %[pMem]\n\t"
3092# else
3093 "ldrexh %[uDst], %[pMem]\n\t" /** @todo fix this */
3094# endif
3095 : [uDst] "=&r" (u32)
3096 : [pMem] "Q" (*pu16));
3097 return (uint16_t)u32;
3098#else
3099 return *pu16;
3100#endif
3101}
3102
3103
3104/**
3105 * Atomically reads a signed 16-bit value, ordered.
3106 *
3107 * @returns Current *pi16 value
3108 * @param pi16 Pointer to the 16-bit variable to read.
3109 */
3110DECLINLINE(int16_t) ASMAtomicReadS16(volatile int16_t RT_FAR *pi16) RT_NOTHROW_DEF
3111{
3112 Assert(!((uintptr_t)pi16 & 1));
3113#if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
3114 return (int16_t)ASMAtomicReadU16((volatile uint16_t RT_FAR *)pi16);
3115#else
3116 ASMMemoryFence();
3117 return *pi16;
3118#endif
3119}
3120
3121
3122/**
3123 * Atomically reads a signed 16-bit value, unordered.
3124 *
3125 * @returns Current *pi16 value
3126 * @param pi16 Pointer to the 16-bit variable to read.
3127 */
3128DECLINLINE(int16_t) ASMAtomicUoReadS16(volatile int16_t RT_FAR *pi16) RT_NOTHROW_DEF
3129{
3130 Assert(!((uintptr_t)pi16 & 1));
3131#if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
3132 int32_t i32;
3133 __asm__ __volatile__("Lstart_ASMAtomicUoReadS16_%=:\n\t"
3134# if defined(RT_ARCH_ARM64)
3135 "ldurh %w[iDst], %[pMem]\n\t"
3136# else
3137 "ldrexh %[iDst], %[pMem]\n\t" /** @todo fix this */
3138# endif
3139 : [iDst] "=&r" (i32)
3140 : [pMem] "Q" (*pi16));
3141 return (int16_t)i32;
3142#else
3143 return *pi16;
3144#endif
3145}
3146
3147
3148/**
3149 * Atomically reads an unsigned 32-bit value, ordered.
3150 *
3151 * @returns Current *pu32 value
3152 * @param pu32 Pointer to the 32-bit variable to read.
3153 */
3154DECLINLINE(uint32_t) ASMAtomicReadU32(volatile uint32_t RT_FAR *pu32) RT_NOTHROW_DEF
3155{
3156 Assert(!((uintptr_t)pu32 & 3));
3157#if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
3158 uint32_t u32;
3159# if defined(RTASM_ARM64_USE_FEAT_LSE) && 0 /* very expensive on M1, but alignment advantages with LEA2 (M2?). */
3160 __asm__ __volatile__("Lstart_ASMAtomicReadU32_%=:\n\t"
3161 RTASM_ARM_DMB_SY
3162 "casa %w[uDst], wzr, %[pMem]\n\t"
3163 : [uDst] "=&r" (u32)
3164 : [pMem] "Q" (*pu32),
3165 "0" (0)
3166 RTASM_ARM_DMB_SY_COMMA_IN_REG);
3167# else
3168 __asm__ __volatile__("Lstart_ASMAtomicReadU32_%=:\n\t"
3169 RTASM_ARM_DMB_SY
3170# if defined(RT_ARCH_ARM64)
3171# if 1 /* ASSUMING proper barrier and aligned access, we should be fine with single-copy atomicity, just like on x86. */
3172 "ldur %w[uDst], %[pMem]\n\t"
3173# else
3174 "ldxr %w[uDst], %[pMem]\n\t"
3175 "clrex\n\t"
3176# endif
3177# else
3178 "ldrex %[uDst], %[pMem]\n\t"
3179 /** @todo clrex */
3180# endif
3181 : [uDst] "=&r" (u32)
3182 : [pMem] "Q" (*pu32)
3183 RTASM_ARM_DMB_SY_COMMA_IN_REG);
3184# endif
3185 return u32;
3186#else
3187 ASMMemoryFence();
3188# if ARCH_BITS == 16
3189 AssertFailed(); /** @todo 16-bit */
3190# endif
3191 return *pu32;
3192#endif
3193}
3194
3195
3196/**
3197 * Atomically reads an unsigned 32-bit value, unordered.
3198 *
3199 * @returns Current *pu32 value
3200 * @param pu32 Pointer to the 32-bit variable to read.
3201 */
3202DECLINLINE(uint32_t) ASMAtomicUoReadU32(volatile uint32_t RT_FAR *pu32) RT_NOTHROW_DEF
3203{
3204 Assert(!((uintptr_t)pu32 & 3));
3205#if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
3206 uint32_t u32;
3207 __asm__ __volatile__("Lstart_ASMAtomicUoReadU32_%=:\n\t"
3208# if defined(RT_ARCH_ARM64)
3209 "ldur %w[uDst], %[pMem]\n\t"
3210# else
3211 "ldrex %[uDst], %[pMem]\n\t" /** @todo fix this */
3212# endif
3213 : [uDst] "=&r" (u32)
3214 : [pMem] "Q" (*pu32));
3215 return u32;
3216#else
3217# if ARCH_BITS == 16
3218 AssertFailed(); /** @todo 16-bit */
3219# endif
3220 return *pu32;
3221#endif
3222}
3223
3224
3225/**
3226 * Atomically reads a signed 32-bit value, ordered.
3227 *
3228 * @returns Current *pi32 value
3229 * @param pi32 Pointer to the 32-bit variable to read.
3230 */
3231DECLINLINE(int32_t) ASMAtomicReadS32(volatile int32_t RT_FAR *pi32) RT_NOTHROW_DEF
3232{
3233 Assert(!((uintptr_t)pi32 & 3));
3234#if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
3235 return (int32_t)ASMAtomicReadU32((volatile uint32_t RT_FAR *)pi32);
3236#else
3237 ASMMemoryFence();
3238# if ARCH_BITS == 16
3239 AssertFailed(); /** @todo 16-bit */
3240# endif
3241 return *pi32;
3242#endif
3243}
3244
3245
3246/**
3247 * Atomically reads a signed 32-bit value, unordered.
3248 *
3249 * @returns Current *pi32 value
3250 * @param pi32 Pointer to the 32-bit variable to read.
3251 */
3252DECLINLINE(int32_t) ASMAtomicUoReadS32(volatile int32_t RT_FAR *pi32) RT_NOTHROW_DEF
3253{
3254 Assert(!((uintptr_t)pi32 & 3));
3255#if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
3256 int32_t i32;
3257 __asm__ __volatile__("Lstart_ASMAtomicUoReadS32_%=:\n\t"
3258# if defined(RT_ARCH_ARM64)
3259 "ldur %w[iDst], %[pMem]\n\t"
3260# else
3261 "ldrex %[iDst], %[pMem]\n\t" /** @todo thix this */
3262# endif
3263 : [iDst] "=&r" (i32)
3264 : [pMem] "Q" (*pi32));
3265 return i32;
3266
3267#else
3268# if ARCH_BITS == 16
3269 AssertFailed(); /** @todo 16-bit */
3270# endif
3271 return *pi32;
3272#endif
3273}
3274
3275
3276/**
3277 * Atomically reads an unsigned 64-bit value, ordered.
3278 *
3279 * @returns Current *pu64 value
3280 * @param pu64 Pointer to the 64-bit variable to read.
3281 * The memory pointed to must be writable.
3282 *
3283 * @remarks This may fault if the memory is read-only!
3284 * @remarks x86: Requires a Pentium or later.
3285 */
3286#if (RT_INLINE_ASM_EXTERNAL_TMP_ARM && !defined(RT_ARCH_AMD64)) \
3287 || RT_INLINE_DONT_MIX_CMPXCHG8B_AND_PIC
3288RT_ASM_DECL_PRAGMA_WATCOM(uint64_t) ASMAtomicReadU64(volatile uint64_t RT_FAR *pu64) RT_NOTHROW_PROTO;
3289#else
3290DECLINLINE(uint64_t) ASMAtomicReadU64(volatile uint64_t RT_FAR *pu64) RT_NOTHROW_DEF
3291{
3292 uint64_t u64;
3293# ifdef RT_ARCH_AMD64
3294 Assert(!((uintptr_t)pu64 & 7));
3295/*# if RT_INLINE_ASM_GNU_STYLE
3296 __asm__ __volatile__( "mfence\n\t"
3297 "movq %1, %0\n\t"
3298 : "=r" (u64)
3299 : "m" (*pu64));
3300# else
3301 __asm
3302 {
3303 mfence
3304 mov rdx, [pu64]
3305 mov rax, [rdx]
3306 mov [u64], rax
3307 }
3308# endif*/
3309 ASMMemoryFence();
3310 u64 = *pu64;
3311
3312# elif defined(RT_ARCH_X86)
3313# if RT_INLINE_ASM_GNU_STYLE
3314# if defined(PIC) || defined(__PIC__)
3315 uint32_t u32EBX = 0;
3316 Assert(!((uintptr_t)pu64 & 7));
3317 __asm__ __volatile__("xchgl %%ebx, %3\n\t"
3318 "lock; cmpxchg8b (%5)\n\t"
3319 "movl %3, %%ebx\n\t"
3320 : "=A" (u64)
3321# if RT_GNUC_PREREQ(4, 3)
3322 , "+m" (*pu64)
3323# else
3324 , "=m" (*pu64)
3325# endif
3326 : "0" (0ULL)
3327 , "m" (u32EBX)
3328 , "c" (0)
3329 , "S" (pu64)
3330 : "cc");
3331# else /* !PIC */
3332 __asm__ __volatile__("lock; cmpxchg8b %1\n\t"
3333 : "=A" (u64)
3334 , "+m" (*pu64)
3335 : "0" (0ULL)
3336 , "b" (0)
3337 , "c" (0)
3338 : "cc");
3339# endif
3340# else
3341 Assert(!((uintptr_t)pu64 & 7));
3342 __asm
3343 {
3344 xor eax, eax
3345 xor edx, edx
3346 mov edi, pu64
3347 xor ecx, ecx
3348 xor ebx, ebx
3349 lock cmpxchg8b [edi]
3350 mov dword ptr [u64], eax
3351 mov dword ptr [u64 + 4], edx
3352 }
3353# endif
3354
3355# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
3356 Assert(!((uintptr_t)pu64 & 7));
3357
3358# if defined(RTASM_ARM64_USE_FEAT_LSE) && 0 /* very expensive on M1, but alignment advantages with LEA2 (M2?). */
3359 __asm__ __volatile__("Lstart_ASMAtomicReadU64_%=:\n\t"
3360 RTASM_ARM_DMB_SY
3361 "casa %[uDst], xzr, %[pMem]\n\t"
3362 : [uDst] "=&r" (u64)
3363 : [pMem] "Q" (*pu64),
3364 "0" (0)
3365 RTASM_ARM_DMB_SY_COMMA_IN_REG);
3366# else
3367 __asm__ __volatile__("Lstart_ASMAtomicReadU64_%=:\n\t"
3368 RTASM_ARM_DMB_SY
3369# if defined(RT_ARCH_ARM64)
3370# if 1 /* ASSUMING proper barrier and aligned access, we should be fine with single-copy atomicity, just like on x86. */
3371 "ldur %[uDst], %[pMem]\n\t"
3372# else
3373 "ldxr %[uDst], %[pMem]\n\t"
3374 "clrex\n\t"
3375# endif
3376# else
3377 "ldrexd %[uDst], %H[uDst], %[pMem]\n\t"
3378 /** @todo clrex */
3379# endif
3380 : [uDst] "=&r" (u64)
3381 : [pMem] "Q" (*pu64)
3382 RTASM_ARM_DMB_SY_COMMA_IN_REG);
3383# endif
3384# else
3385# error "Port me"
3386# endif
3387 return u64;
3388}
3389#endif
3390
3391
3392/**
3393 * Atomically reads an unsigned 64-bit value, unordered.
3394 *
3395 * @returns Current *pu64 value
3396 * @param pu64 Pointer to the 64-bit variable to read.
3397 * The memory pointed to must be writable.
3398 *
3399 * @remarks This may fault if the memory is read-only!
3400 * @remarks x86: Requires a Pentium or later.
3401 */
3402#if !defined(RT_ARCH_AMD64) \
3403 && ( (RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN) \
3404 || RT_INLINE_DONT_MIX_CMPXCHG8B_AND_PIC)
3405RT_ASM_DECL_PRAGMA_WATCOM(uint64_t) ASMAtomicUoReadU64(volatile uint64_t RT_FAR *pu64) RT_NOTHROW_PROTO;
3406#else
3407DECLINLINE(uint64_t) ASMAtomicUoReadU64(volatile uint64_t RT_FAR *pu64) RT_NOTHROW_DEF
3408{
3409 uint64_t u64;
3410# ifdef RT_ARCH_AMD64
3411 Assert(!((uintptr_t)pu64 & 7));
3412/*# if RT_INLINE_ASM_GNU_STYLE
3413 Assert(!((uintptr_t)pu64 & 7));
3414 __asm__ __volatile__("movq %1, %0\n\t"
3415 : "=r" (u64)
3416 : "m" (*pu64));
3417# else
3418 __asm
3419 {
3420 mov rdx, [pu64]
3421 mov rax, [rdx]
3422 mov [u64], rax
3423 }
3424# endif */
3425 u64 = *pu64;
3426
3427# elif defined(RT_ARCH_X86)
3428# if RT_INLINE_ASM_GNU_STYLE
3429# if defined(PIC) || defined(__PIC__)
3430 uint32_t u32EBX = 0;
3431 uint32_t u32Spill;
3432 Assert(!((uintptr_t)pu64 & 7));
3433 __asm__ __volatile__("xor %%eax,%%eax\n\t"
3434 "xor %%ecx,%%ecx\n\t"
3435 "xor %%edx,%%edx\n\t"
3436 "xchgl %%ebx, %3\n\t"
3437 "lock; cmpxchg8b (%4)\n\t"
3438 "movl %3, %%ebx\n\t"
3439 : "=A" (u64)
3440# if RT_GNUC_PREREQ(4, 3)
3441 , "+m" (*pu64)
3442# else
3443 , "=m" (*pu64)
3444# endif
3445 , "=c" (u32Spill)
3446 : "m" (u32EBX)
3447 , "S" (pu64)
3448 : "cc");
3449# else /* !PIC */
3450 __asm__ __volatile__("lock; cmpxchg8b %1\n\t"
3451 : "=A" (u64)
3452 , "+m" (*pu64)
3453 : "0" (0ULL)
3454 , "b" (0)
3455 , "c" (0)
3456 : "cc");
3457# endif
3458# else
3459 Assert(!((uintptr_t)pu64 & 7));
3460 __asm
3461 {
3462 xor eax, eax
3463 xor edx, edx
3464 mov edi, pu64
3465 xor ecx, ecx
3466 xor ebx, ebx
3467 lock cmpxchg8b [edi]
3468 mov dword ptr [u64], eax
3469 mov dword ptr [u64 + 4], edx
3470 }
3471# endif
3472
3473# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
3474 Assert(!((uintptr_t)pu64 & 7));
3475 __asm__ __volatile__("Lstart_ASMAtomicUoReadU64_%=:\n\t"
3476# if defined(RT_ARCH_ARM64)
3477 "ldur %[uDst], %[pMem]\n\t"
3478# else
3479 "ldrexd %[uDst], %H[uDst], %[pMem]\n\t" /* this is required for atomic access since it's a pair */
3480 /** @todo clrex? */
3481# endif
3482 : [uDst] "=&r" (u64)
3483 : [pMem] "Q" (*pu64));
3484
3485# else
3486# error "Port me"
3487# endif
3488 return u64;
3489}
3490#endif
3491
3492
3493/**
3494 * Atomically reads a signed 64-bit value, ordered.
3495 *
3496 * @returns Current *pi64 value
3497 * @param pi64 Pointer to the 64-bit variable to read.
3498 * The memory pointed to must be writable.
3499 *
3500 * @remarks This may fault if the memory is read-only!
3501 * @remarks x86: Requires a Pentium or later.
3502 */
3503DECLINLINE(int64_t) ASMAtomicReadS64(volatile int64_t RT_FAR *pi64) RT_NOTHROW_DEF
3504{
3505 return (int64_t)ASMAtomicReadU64((volatile uint64_t RT_FAR *)pi64);
3506}
3507
3508
3509/**
3510 * Atomically reads a signed 64-bit value, unordered.
3511 *
3512 * @returns Current *pi64 value
3513 * @param pi64 Pointer to the 64-bit variable to read.
3514 * The memory pointed to must be writable.
3515 *
3516 * @remarks This will fault if the memory is read-only!
3517 * @remarks x86: Requires a Pentium or later.
3518 */
3519DECLINLINE(int64_t) ASMAtomicUoReadS64(volatile int64_t RT_FAR *pi64) RT_NOTHROW_DEF
3520{
3521 return (int64_t)ASMAtomicUoReadU64((volatile uint64_t RT_FAR *)pi64);
3522}
3523
3524
/** @def RTASM_HAVE_READ_U128
 * Defined if the target architecture supports atomic reading of 128-bit
 * integers.
 *
 * The value tells how the reads are implemented:
 *  - 0: both ordered and unordered reads go through ASMAtomicCmpXchgU128v2().
 *  - 1: unordered reads are done natively w/o cmpxchg.
 *  - 3: both variants are done natively w/o cmpxchg.
 *
 * @note    AMD64: Caller must check for cmpxchg16b support before use and make
 *          sure variables are writable (won't be changed).
 * @sa      RTASM_HAVE_CMP_XCHG_U128, RTASM_HAVE_WRITE_U128
 */
#if defined(RT_ARCH_ARM64) || defined(DOXYGEN_RUNNING)
# define RTASM_HAVE_READ_U128   3
#elif defined(RTASM_HAVE_CMP_XCHG_U128)
# define RTASM_HAVE_READ_U128   0
#endif
3542
3543#ifdef RTASM_HAVE_READ_U128
3544
3545/**
3546 * Atomically reads an unsigned 128-bit value, ordered.
3547 *
3548 * @returns Current *pu128 value
3549 * @param pu128 Pointer to the 128-bit variable to read.
3550 * The memory pointed to must be writable.
3551 *
3552 * @remarks AMD64: Requires the memory to be both readable and writable.
3553 * @remarks AMD64: Requires support for cmpxchg16b.
3554 */
3555DECLINLINE(uint128_t) ASMAtomicReadU128(volatile uint128_t RT_FAR *pu128) RT_NOTHROW_DEF
3556{
3557 RTUINT128U u128Ret;
3558 Assert(!((uintptr_t)pu128 & 15));
3559# if defined(__GNUC__) && defined(RT_ARCH_ARM64)
3560 __asm__ __volatile__("Lstart_ASMAtomicReadU128_%=:\n\t"
3561 RTASM_ARM_DMB_SY
3562 "ldp %[uRetLo], %[uRetHi], %[pMem]\n\t"
3563 RTASM_ARM_DMB_SY
3564 : [uRetHi] "=r" (u128Ret.s.Hi)
3565 , [uRetLo] "=r" (u128Ret.s.Lo)
3566 : [pMem] "Q" (*pu128)
3567 : );
3568 return u128Ret.u;
3569# else
3570 ASMAtomicCmpXchgU128v2(pu128, 0, 0, 0, 0, &u128Ret.u);
3571 return u128Ret.u;
3572# endif
3573}
3574
3575/**
3576 * Atomically reads an unsigned 128-bit value, ordered.
3577 *
3578 * @returns Current *pu128 value
3579 * @param pu128 Pointer to the 128-bit variable to read.
3580 * The memory pointed to must be writable.
3581 *
3582 * @remarks AMD64: Requires the memory to be both readable and writable.
3583 * @remarks AMD64: Requires support for cmpxchg16b.
3584 */
3585DECLINLINE(RTUINT128U) ASMAtomicReadU128U(volatile RTUINT128U RT_FAR *pu128) RT_NOTHROW_DEF
3586{
3587 RTUINT128U u128Ret;
3588 Assert(!((uintptr_t)pu128 & 15));
3589# if defined(__GNUC__) && defined(RT_ARCH_ARM64)
3590 __asm__ __volatile__("Lstart_ASMAtomicReadU128U_%=:\n\t"
3591 RTASM_ARM_DMB_SY
3592 "ldp %[uRetLo], %[uRetHi], %[pMem]\n\t"
3593 RTASM_ARM_DMB_SY
3594 : [uRetHi] "=r" (u128Ret.s.Hi)
3595 , [uRetLo] "=r" (u128Ret.s.Lo)
3596 : [pMem] "Q" (*pu128)
3597 : );
3598 return u128Ret;
3599# else
3600 ASMAtomicCmpXchgU128v2(&pu128->u, 0, 0, 0, 0, &u128Ret.u);
3601 return u128Ret;
3602# endif
3603}
3604
3605
3606/**
3607 * Atomically reads an unsigned 128-bit value, unordered.
3608 *
3609 * @returns Current *pu128 value
3610 * @param pu128 Pointer to the 128-bit variable to read.
3611 * The memory pointed to must be writable.
3612 *
3613 * @remarks AMD64: Requires the memory to be both readable and writable.
3614 * @remarks AMD64: Requires support for cmpxchg16b.
3615 * @remarks AMD64: Is ordered.
3616 */
3617DECLINLINE(uint128_t) ASMAtomicUoReadU128(volatile uint128_t RT_FAR *pu128) RT_NOTHROW_DEF
3618{
3619 Assert(!((uintptr_t)pu128 & 15));
3620# if defined(__GNUC__) && defined(RT_ARCH_ARM64)
3621 RTUINT128U u128Ret;
3622 __asm__ __volatile__("Lstart_ASMAtomicUoReadU128_%=:\n\t"
3623 "ldp %[uRetLo], %[uRetHi], %[pMem]\n\t"
3624 : [uRetHi] "=r" (u128Ret.s.Hi)
3625 , [uRetLo] "=r" (u128Ret.s.Lo)
3626 : [pMem] "Q" (*pu128)
3627 : );
3628 return u128Ret.u;
3629
3630# elif defined(RT_ARCH_AMD64) && 0
3631 /* This doesn't work because __m128i can't be made volatile and we're not
3632 able to force MSC (2019) to emit _mm_load_si128 (besides it emits movdqu
3633 instead of movdqa). */
3634 __m128i uTmpSse = _mm_load_si128((__m128i volatile *)pu128);
3635 __m128i uTmpSseHi = _mm_srli_si128(uTmpSse, 64 / 8);
3636 RTUINT128U u128Ret;
3637 u128Ret.s.Lo = (uint64_t)_mm_cvtsi128_si64(uTmpSse);
3638 u128Ret.s.Hi = (uint64_t)_mm_cvtsi128_si64(uTmpSseHi);
3639 return u128Ret.u;
3640
3641# else
3642 return ASMAtomicReadU128(pu128);
3643# endif
3644}
3645
3646/**
3647 * Atomically reads an unsigned 128-bit value, unordered.
3648 *
3649 * @returns Current *pu128 value
3650 * @param pu128 Pointer to the 128-bit variable to read.
3651 * The memory pointed to must be writable.
3652 *
3653 * @remarks AMD64: Requires the memory to be both readable and writable.
3654 * @remarks AMD64: Requires support for cmpxchg16b.
3655 * @remarks AMD64: Is ordered.
3656 */
3657DECLINLINE(RTUINT128U) ASMAtomicUoReadU128U(volatile RTUINT128U RT_FAR *pu128) RT_NOTHROW_DEF
3658{
3659 Assert(!((uintptr_t)pu128 & 15));
3660# if defined(__GNUC__) && defined(RT_ARCH_ARM64)
3661 RTUINT128U u128Ret;
3662 __asm__ __volatile__("Lstart_ASMAtomicUoReadU128U_%=:\n\t"
3663 "ldp %[uRetLo], %[uRetHi], %[pMem]\n\t"
3664 : [uRetHi] "=r" (u128Ret.s.Hi)
3665 , [uRetLo] "=r" (u128Ret.s.Lo)
3666 : [pMem] "Q" (*pu128)
3667 : );
3668 return u128Ret;
3669# else
3670 return ASMAtomicReadU128U(pu128);
3671# endif
3672}
3673
3674#endif /* RTASM_HAVE_READ_U128 */
3675
3676/**
3677 * Atomically reads a size_t value, ordered.
3678 *
3679 * @returns Current *pcb value
3680 * @param pcb Pointer to the size_t variable to read.
3681 */
3682DECLINLINE(size_t) ASMAtomicReadZ(size_t volatile RT_FAR *pcb) RT_NOTHROW_DEF
3683{
3684#if ARCH_BITS == 64
3685 return ASMAtomicReadU64((uint64_t volatile RT_FAR *)pcb);
3686#elif ARCH_BITS == 32
3687 return ASMAtomicReadU32((uint32_t volatile RT_FAR *)pcb);
3688#elif ARCH_BITS == 16
3689 AssertCompileSize(size_t, 2);
3690 return ASMAtomicReadU16((uint16_t volatile RT_FAR *)pcb);
3691#else
3692# error "Unsupported ARCH_BITS value"
3693#endif
3694}
3695
3696
3697/**
3698 * Atomically reads a size_t value, unordered.
3699 *
3700 * @returns Current *pcb value
3701 * @param pcb Pointer to the size_t variable to read.
3702 */
3703DECLINLINE(size_t) ASMAtomicUoReadZ(size_t volatile RT_FAR *pcb) RT_NOTHROW_DEF
3704{
3705#if ARCH_BITS == 64 || ARCH_BITS == 16
3706 return ASMAtomicUoReadU64((uint64_t volatile RT_FAR *)pcb);
3707#elif ARCH_BITS == 32
3708 return ASMAtomicUoReadU32((uint32_t volatile RT_FAR *)pcb);
3709#elif ARCH_BITS == 16
3710 AssertCompileSize(size_t, 2);
3711 return ASMAtomicUoReadU16((uint16_t volatile RT_FAR *)pcb);
3712#else
3713# error "Unsupported ARCH_BITS value"
3714#endif
3715}
3716
3717
3718/**
3719 * Atomically reads a pointer value, ordered.
3720 *
3721 * @returns Current *pv value
3722 * @param ppv Pointer to the pointer variable to read.
3723 *
3724 * @remarks Please use ASMAtomicReadPtrT, it provides better type safety and
3725 * requires less typing (no casts).
3726 */
3727DECLINLINE(void RT_FAR *) ASMAtomicReadPtr(void RT_FAR * volatile RT_FAR *ppv) RT_NOTHROW_DEF
3728{
3729#if ARCH_BITS == 32 || ARCH_BITS == 16
3730 return (void RT_FAR *)ASMAtomicReadU32((volatile uint32_t RT_FAR *)(void RT_FAR *)ppv);
3731#elif ARCH_BITS == 64
3732 return (void RT_FAR *)ASMAtomicReadU64((volatile uint64_t RT_FAR *)(void RT_FAR *)ppv);
3733#else
3734# error "ARCH_BITS is bogus"
3735#endif
3736}
3737
3738/**
3739 * Convenience macro for avoiding the annoying casting with ASMAtomicReadPtr.
3740 *
3741 * @returns Current *pv value
3742 * @param ppv Pointer to the pointer variable to read.
3743 * @param Type The type of *ppv, sans volatile.
3744 */
3745#ifdef __GNUC__ /* 8.2.0 requires -Wno-ignored-qualifiers */
3746# define ASMAtomicReadPtrT(ppv, Type) \
3747 __extension__ \
3748 ({\
3749 __typeof__(*(ppv)) volatile *ppvTypeChecked = (ppv); \
3750 Type pvTypeChecked = (__typeof__(*(ppv))) ASMAtomicReadPtr((void * volatile *)ppvTypeChecked); \
3751 pvTypeChecked; \
3752 })
3753#else
3754# define ASMAtomicReadPtrT(ppv, Type) \
3755 (Type)ASMAtomicReadPtr((void RT_FAR * volatile RT_FAR *)(ppv))
3756#endif
3757
3758
3759/**
3760 * Atomically reads a pointer value, unordered.
3761 *
3762 * @returns Current *pv value
3763 * @param ppv Pointer to the pointer variable to read.
3764 *
3765 * @remarks Please use ASMAtomicUoReadPtrT, it provides better type safety and
3766 * requires less typing (no casts).
3767 */
3768DECLINLINE(void RT_FAR *) ASMAtomicUoReadPtr(void RT_FAR * volatile RT_FAR *ppv) RT_NOTHROW_DEF
3769{
3770#if ARCH_BITS == 32 || ARCH_BITS == 16
3771 return (void RT_FAR *)ASMAtomicUoReadU32((volatile uint32_t RT_FAR *)(void RT_FAR *)ppv);
3772#elif ARCH_BITS == 64
3773 return (void RT_FAR *)ASMAtomicUoReadU64((volatile uint64_t RT_FAR *)(void RT_FAR *)ppv);
3774#else
3775# error "ARCH_BITS is bogus"
3776#endif
3777}
3778
3779
3780/**
3781 * Convenience macro for avoiding the annoying casting with ASMAtomicUoReadPtr.
3782 *
3783 * @returns Current *pv value
3784 * @param ppv Pointer to the pointer variable to read.
3785 * @param Type The type of *ppv, sans volatile.
3786 */
3787#ifdef __GNUC__ /* 8.2.0 requires -Wno-ignored-qualifiers */
3788# define ASMAtomicUoReadPtrT(ppv, Type) \
3789 __extension__ \
3790 ({\
3791 __typeof__(*(ppv)) volatile * const ppvTypeChecked = (ppv); \
3792 Type pvTypeChecked = (__typeof__(*(ppv))) ASMAtomicUoReadPtr((void * volatile *)ppvTypeChecked); \
3793 pvTypeChecked; \
3794 })
3795#else
3796# define ASMAtomicUoReadPtrT(ppv, Type) \
3797 (Type)ASMAtomicUoReadPtr((void RT_FAR * volatile RT_FAR *)(ppv))
3798#endif
3799
3800
3801/**
3802 * Atomically reads a boolean value, ordered.
3803 *
3804 * @returns Current *pf value
3805 * @param pf Pointer to the boolean variable to read.
3806 */
3807DECLINLINE(bool) ASMAtomicReadBool(volatile bool RT_FAR *pf) RT_NOTHROW_DEF
3808{
3809 ASMMemoryFence();
3810 return *pf; /* byte reads are atomic on x86 */
3811}
3812
3813
3814/**
3815 * Atomically reads a boolean value, unordered.
3816 *
3817 * @returns Current *pf value
3818 * @param pf Pointer to the boolean variable to read.
3819 */
3820DECLINLINE(bool) ASMAtomicUoReadBool(volatile bool RT_FAR *pf) RT_NOTHROW_DEF
3821{
3822 return *pf; /* byte reads are atomic on x86 */
3823}
3824
3825
3826/**
3827 * Atomically read a typical IPRT handle value, ordered.
3828 *
3829 * @param ph Pointer to the handle variable to read.
3830 * @param phRes Where to store the result.
3831 *
3832 * @remarks This doesn't currently work for all handles (like RTFILE).
3833 */
3834#if HC_ARCH_BITS == 32 || ARCH_BITS == 16
3835# define ASMAtomicReadHandle(ph, phRes) \
3836 do { \
3837 AssertCompile(sizeof(*(ph)) == sizeof(uint32_t)); \
3838 AssertCompile(sizeof(*(phRes)) == sizeof(uint32_t)); \
3839 *(uint32_t RT_FAR *)(phRes) = ASMAtomicReadU32((uint32_t volatile RT_FAR *)(ph)); \
3840 } while (0)
3841#elif HC_ARCH_BITS == 64
3842# define ASMAtomicReadHandle(ph, phRes) \
3843 do { \
3844 AssertCompile(sizeof(*(ph)) == sizeof(uint64_t)); \
3845 AssertCompile(sizeof(*(phRes)) == sizeof(uint64_t)); \
3846 *(uint64_t RT_FAR *)(phRes) = ASMAtomicReadU64((uint64_t volatile RT_FAR *)(ph)); \
3847 } while (0)
3848#else
3849# error HC_ARCH_BITS
3850#endif
3851
3852
3853/**
3854 * Atomically read a typical IPRT handle value, unordered.
3855 *
3856 * @param ph Pointer to the handle variable to read.
3857 * @param phRes Where to store the result.
3858 *
3859 * @remarks This doesn't currently work for all handles (like RTFILE).
3860 */
3861#if HC_ARCH_BITS == 32 || ARCH_BITS == 16
3862# define ASMAtomicUoReadHandle(ph, phRes) \
3863 do { \
3864 AssertCompile(sizeof(*(ph)) == sizeof(uint32_t)); \
3865 AssertCompile(sizeof(*(phRes)) == sizeof(uint32_t)); \
3866 *(uint32_t RT_FAR *)(phRes) = ASMAtomicUoReadU32((uint32_t volatile RT_FAR *)(ph)); \
3867 } while (0)
3868#elif HC_ARCH_BITS == 64
3869# define ASMAtomicUoReadHandle(ph, phRes) \
3870 do { \
3871 AssertCompile(sizeof(*(ph)) == sizeof(uint64_t)); \
3872 AssertCompile(sizeof(*(phRes)) == sizeof(uint64_t)); \
3873 *(uint64_t RT_FAR *)(phRes) = ASMAtomicUoReadU64((uint64_t volatile RT_FAR *)(ph)); \
3874 } while (0)
3875#else
3876# error HC_ARCH_BITS
3877#endif
3878
3879
3880/**
3881 * Atomically read a value which size might differ
3882 * between platforms or compilers, ordered.
3883 *
3884 * @param pu Pointer to the variable to read.
3885 * @param puRes Where to store the result.
3886 */
3887#define ASMAtomicReadSize(pu, puRes) \
3888 do { \
3889 switch (sizeof(*(pu))) { \
3890 case 1: *(uint8_t RT_FAR *)(puRes) = ASMAtomicReadU8( (volatile uint8_t RT_FAR *)(void RT_FAR *)(pu)); break; \
3891 case 2: *(uint16_t RT_FAR *)(puRes) = ASMAtomicReadU16((volatile uint16_t RT_FAR *)(void RT_FAR *)(pu)); break; \
3892 case 4: *(uint32_t RT_FAR *)(puRes) = ASMAtomicReadU32((volatile uint32_t RT_FAR *)(void RT_FAR *)(pu)); break; \
3893 case 8: *(uint64_t RT_FAR *)(puRes) = ASMAtomicReadU64((volatile uint64_t RT_FAR *)(void RT_FAR *)(pu)); break; \
3894 default: AssertMsgFailed(("ASMAtomicReadSize: size %d is not supported\n", sizeof(*(pu)))); \
3895 } \
3896 } while (0)
3897
3898
3899/**
3900 * Atomically read a value which size might differ
3901 * between platforms or compilers, unordered.
3902 *
3903 * @param pu Pointer to the variable to read.
3904 * @param puRes Where to store the result.
3905 */
3906#define ASMAtomicUoReadSize(pu, puRes) \
3907 do { \
3908 switch (sizeof(*(pu))) { \
3909 case 1: *(uint8_t RT_FAR *)(puRes) = ASMAtomicUoReadU8( (volatile uint8_t RT_FAR *)(void RT_FAR *)(pu)); break; \
3910 case 2: *(uint16_t RT_FAR *)(puRes) = ASMAtomicUoReadU16((volatile uint16_t RT_FAR *)(void RT_FAR *)(pu)); break; \
3911 case 4: *(uint32_t RT_FAR *)(puRes) = ASMAtomicUoReadU32((volatile uint32_t RT_FAR *)(void RT_FAR *)(pu)); break; \
3912 case 8: *(uint64_t RT_FAR *)(puRes) = ASMAtomicUoReadU64((volatile uint64_t RT_FAR *)(void RT_FAR *)(pu)); break; \
3913 default: AssertMsgFailed(("ASMAtomicReadSize: size %d is not supported\n", sizeof(*(pu)))); \
3914 } \
3915 } while (0)
3916
3917
3918/**
3919 * Atomically writes an unsigned 8-bit value, ordered.
3920 *
3921 * @param pu8 Pointer to the 8-bit variable.
3922 * @param u8 The 8-bit value to assign to *pu8.
3923 */
3924DECLINLINE(void) ASMAtomicWriteU8(volatile uint8_t RT_FAR *pu8, uint8_t u8) RT_NOTHROW_DEF
3925{
3926#if defined(RT_ARCH_ARM64)
3927 /* The DMB SY will ensure ordering a la x86, the stlrb is probably overkill
3928 as all byte accesses are single-copy atomic, which I think suffices here. */
3929 __asm__ __volatile__("Lstart_ASMAtomicWriteU8_%=:\n\t"
3930# if defined(RTASM_ARM64_USE_FEAT_LSE) && 0 /* this is a lot slower and has no alignment benefits with LSE2 */
3931 RTASM_ARM_DMB_SY
3932 "swpb %w[uValue], wzr, %[pMem]\n\t"
3933# else
3934 RTASM_ARM_DMB_SY
3935 "stlrb %w[uValue], %[pMem]\n\t" /* single-copy atomic w/ release semantics. */
3936# endif
3937 : [pMem] "+Q" (*pu8)
3938 : [uValue] "r" ((uint32_t)u8)
3939 : );
3940#else
3941 ASMAtomicXchgU8(pu8, u8);
3942#endif
3943}
3944
3945
/**
 * Atomically writes an unsigned 8-bit value, unordered.
 *
 * Implemented as a single plain store through the volatile pointer; no
 * exchange instruction or explicit barrier is used.
 *
 * @param   pu8     Pointer to the 8-bit variable to write.
 * @param   u8      The 8-bit value to assign to *pu8.
 */
DECLINLINE(void) ASMAtomicUoWriteU8(volatile uint8_t RT_FAR *pu8, uint8_t u8) RT_NOTHROW_DEF
{
    *pu8 = u8; /* byte writes are atomic on x86 */
}
3956
3957
3958/**
3959 * Atomically writes a signed 8-bit value, ordered.
3960 *
3961 * @param pi8 Pointer to the 8-bit variable to read.
3962 * @param i8 The 8-bit value to assign to *pi8.
3963 */
3964DECLINLINE(void) ASMAtomicWriteS8(volatile int8_t RT_FAR *pi8, int8_t i8) RT_NOTHROW_DEF
3965{
3966#if defined(RT_ARCH_ARM64)
3967 ASMAtomicWriteU8((volatile uint8_t RT_FAR *)pi8, (uint8_t)i8);
3968#else
3969 ASMAtomicXchgS8(pi8, i8);
3970#endif
3971}
3972
3973
/**
 * Atomically writes a signed 8-bit value, unordered.
 *
 * Implemented as a single plain store through the volatile pointer.
 *
 * @param   pi8     Pointer to the 8-bit variable to write.
 * @param   i8      The 8-bit value to assign to *pi8.
 */
DECLINLINE(void) ASMAtomicUoWriteS8(volatile int8_t RT_FAR *pi8, int8_t i8) RT_NOTHROW_DEF
{
    *pi8 = i8; /* byte writes are atomic on x86 */
}
3984
3985
3986/**
3987 * Atomically writes an unsigned 16-bit value, ordered.
3988 *
3989 * @param pu16 Pointer to the 16-bit variable to write.
3990 * @param u16 The 16-bit value to assign to *pu16.
3991 */
3992DECLINLINE(void) ASMAtomicWriteU16(volatile uint16_t RT_FAR *pu16, uint16_t u16) RT_NOTHROW_DEF
3993{
3994#if defined(RT_ARCH_ARM64)
3995 __asm__ __volatile__("Lstart_ASMAtomicWriteU16_%=:\n\t"
3996# if defined(RTASM_ARM64_USE_FEAT_LSE) /* slower on M1, but benefits from relaxed LSE2 alignment requirements (M2?). */
3997 RTASM_ARM_DMB_SY
3998 "swph %w[uValue], wzr, %[pMem]\n\t"
3999# else
4000 RTASM_ARM_DMB_SY
4001 "stlrh %w[uValue], %[pMem]\n\t" /* single-copy atomic w/ release semantics. */
4002# endif
4003 : [pMem] "+Q" (*pu16)
4004 : [uValue] "r" ((uint32_t)u16)
4005 : );
4006#else
4007 ASMAtomicXchgU16(pu16, u16);
4008#endif
4009}
4010
4011
/**
 * Atomically writes an unsigned 16-bit value, unordered.
 *
 * Asserts that the destination is 2-byte aligned and then performs a single
 * plain store through the volatile pointer.
 *
 * @param   pu16    Pointer to the 16-bit variable to write.
 * @param   u16     The 16-bit value to assign to *pu16.
 */
DECLINLINE(void) ASMAtomicUoWriteU16(volatile uint16_t RT_FAR *pu16, uint16_t u16) RT_NOTHROW_DEF
{
    Assert(!((uintptr_t)pu16 & 1));
    *pu16 = u16;
}
4023
4024
4025/**
4026 * Atomically writes a signed 16-bit value, ordered.
4027 *
4028 * @param pi16 Pointer to the 16-bit variable to write.
4029 * @param i16 The 16-bit value to assign to *pi16.
4030 */
4031DECLINLINE(void) ASMAtomicWriteS16(volatile int16_t RT_FAR *pi16, int16_t i16) RT_NOTHROW_DEF
4032{
4033#if defined(RT_ARCH_ARM64)
4034 ASMAtomicWriteU16((volatile uint16_t RT_FAR *)pi16, (uint16_t)i16);
4035#else
4036 ASMAtomicXchgS16(pi16, i16);
4037#endif
4038}
4039
4040
/**
 * Atomically writes a signed 16-bit value, unordered.
 *
 * Asserts that the destination is 2-byte aligned and then performs a single
 * plain store through the volatile pointer.
 *
 * @param   pi16    Pointer to the 16-bit variable to write.
 * @param   i16     The 16-bit value to assign to *pi16.
 */
DECLINLINE(void) ASMAtomicUoWriteS16(volatile int16_t RT_FAR *pi16, int16_t i16) RT_NOTHROW_DEF
{
    Assert(!((uintptr_t)pi16 & 1));
    *pi16 = i16;
}
4052
4053
4054/**
4055 * Atomically writes an unsigned 32-bit value, ordered.
4056 *
4057 * @param pu32 Pointer to the 32-bit variable to write.
4058 * @param u32 The 32-bit value to assign to *pu32.
4059 */
4060DECLINLINE(void) ASMAtomicWriteU32(volatile uint32_t RT_FAR *pu32, uint32_t u32) RT_NOTHROW_DEF
4061{
4062#if defined(RT_ARCH_ARM64)
4063 __asm__ __volatile__("Lstart_ASMAtomicWriteU32_%=:\n\t"
4064# if defined(RTASM_ARM64_USE_FEAT_LSE) /* slower on M1, but benefits from relaxed LSE2 alignment requirements (M2?). */
4065 RTASM_ARM_DMB_SY
4066 "swp %w[uValue], wzr, %[pMem]\n\t"
4067# else
4068 RTASM_ARM_DMB_SY
4069 "stlr %w[uValue], %[pMem]\n\t" /* single-copy atomic w/ release semantics. */
4070# endif
4071 : [pMem] "+Q" (*pu32)
4072 : [uValue] "r" (u32)
4073 : "cc");
4074#else
4075 ASMAtomicXchgU32(pu32, u32);
4076#endif
4077}
4078
4079
4080/**
4081 * Atomically writes an unsigned 32-bit value, unordered.
4082 *
4083 * @param pu32 Pointer to the 32-bit variable to write.
4084 * @param u32 The 32-bit value to assign to *pu32.
4085 */
4086DECLINLINE(void) ASMAtomicUoWriteU32(volatile uint32_t RT_FAR *pu32, uint32_t u32) RT_NOTHROW_DEF
4087{
4088 Assert(!((uintptr_t)pu32 & 3));
4089#if ARCH_BITS >= 32
4090 *pu32 = u32;
4091#else
4092 ASMAtomicXchgU32(pu32, u32);
4093#endif
4094}
4095
4096
4097/**
4098 * Atomically writes a signed 32-bit value, ordered.
4099 *
4100 * @param pi32 Pointer to the 32-bit variable to write.
4101 * @param i32 The 32-bit value to assign to *pi32.
4102 */
4103DECLINLINE(void) ASMAtomicWriteS32(volatile int32_t RT_FAR *pi32, int32_t i32) RT_NOTHROW_DEF
4104{
4105#if defined(RT_ARCH_ARM64)
4106 ASMAtomicWriteU32((volatile uint32_t RT_FAR *)pi32, (uint32_t)i32);
4107#else
4108 ASMAtomicXchgS32(pi32, i32);
4109#endif
4110}
4111
4112
4113/**
4114 * Atomically writes a signed 32-bit value, unordered.
4115 *
4116 * @param pi32 Pointer to the 32-bit variable to write.
4117 * @param i32 The 32-bit value to assign to *pi32.
4118 */
4119DECLINLINE(void) ASMAtomicUoWriteS32(volatile int32_t RT_FAR *pi32, int32_t i32) RT_NOTHROW_DEF
4120{
4121 Assert(!((uintptr_t)pi32 & 3));
4122#if ARCH_BITS >= 32
4123 *pi32 = i32;
4124#else
4125 ASMAtomicXchgS32(pi32, i32);
4126#endif
4127}
4128
4129
4130/**
4131 * Atomically writes an unsigned 64-bit value, ordered.
4132 *
4133 * @param pu64 Pointer to the 64-bit variable to write.
4134 * @param u64 The 64-bit value to assign to *pu64.
4135 */
4136DECLINLINE(void) ASMAtomicWriteU64(volatile uint64_t RT_FAR *pu64, uint64_t u64) RT_NOTHROW_DEF
4137{
4138#if defined(RT_ARCH_ARM64)
4139 __asm__ __volatile__("Lstart_ASMAtomicWriteU64_%=:\n\t"
4140# if defined(RTASM_ARM64_USE_FEAT_LSE) /* slower on M1, but benefits from relaxed LSE2 alignment requirements (M2?). */
4141 RTASM_ARM_DMB_SY
4142 "swp %[uValue], xzr, %[pMem]\n\t"
4143# else
4144 RTASM_ARM_DMB_SY /** @todo necessary? */
4145 "stlr %[uValue], %[pMem]\n\t"
4146# endif
4147 : [pMem] "+Q" (*pu64)
4148 : [uValue] "r" (u64)
4149 : );
4150#else
4151 ASMAtomicXchgU64(pu64, u64);
4152#endif
4153}
4154
4155
4156/**
4157 * Atomically writes an unsigned 64-bit value, unordered.
4158 *
4159 * @param pu64 Pointer to the 64-bit variable to write.
4160 * @param u64 The 64-bit value to assign to *pu64.
4161 */
4162DECLINLINE(void) ASMAtomicUoWriteU64(volatile uint64_t RT_FAR *pu64, uint64_t u64) RT_NOTHROW_DEF
4163{
4164 Assert(!((uintptr_t)pu64 & 7));
4165#if ARCH_BITS == 64
4166 *pu64 = u64;
4167#else
4168 ASMAtomicXchgU64(pu64, u64);
4169#endif
4170}
4171
4172
4173/**
4174 * Atomically writes a signed 64-bit value, ordered.
4175 *
4176 * @param pi64 Pointer to the 64-bit variable to write.
4177 * @param i64 The 64-bit value to assign to *pi64.
4178 */
4179DECLINLINE(void) ASMAtomicWriteS64(volatile int64_t RT_FAR *pi64, int64_t i64) RT_NOTHROW_DEF
4180{
4181#if defined(RT_ARCH_ARM64)
4182 ASMAtomicWriteU64((volatile uint64_t RT_FAR *)pi64, (uint64_t)i64);
4183#else
4184 ASMAtomicXchgS64(pi64, i64);
4185#endif
4186}
4187
4188
4189/**
4190 * Atomically writes a signed 64-bit value, unordered.
4191 *
4192 * @param pi64 Pointer to the 64-bit variable to write.
4193 * @param i64 The 64-bit value to assign to *pi64.
4194 */
4195DECLINLINE(void) ASMAtomicUoWriteS64(volatile int64_t RT_FAR *pi64, int64_t i64) RT_NOTHROW_DEF
4196{
4197 Assert(!((uintptr_t)pi64 & 7));
4198#if ARCH_BITS == 64
4199 *pi64 = i64;
4200#else
4201 ASMAtomicXchgS64(pi64, i64);
4202#endif
4203}
4204
4205
/** @def RTASM_HAVE_WRITE_U128
 * Defined if the target architecture supports atomic writes of 128-bit
 * integers.
 *
 * The define value is zero if both ordered and unordered writes are implemented
 * using ASMAtomicCmpXchgU128v2().  It is 1 if unordered writes are done
 * natively w/o cmpxchg and 3 if both variants are done natively w/o cmpxchg.
 *
 * As coded here it is 3 for ARM64 (and when DOXYGEN_RUNNING is defined), 0 when
 * only RTASM_HAVE_CMP_XCHG_U128 is defined, and left undefined otherwise.
 *
 * @note AMD64: Caller must check for cmpxchg16b support before use.
 * @sa RTASM_HAVE_CMP_XCHG_U128
 */
#if defined(RT_ARCH_ARM64) || defined(DOXYGEN_RUNNING)
# define RTASM_HAVE_WRITE_U128 3
#elif defined(RTASM_HAVE_CMP_XCHG_U128)
# define RTASM_HAVE_WRITE_U128 0
#endif
4221
4222#ifdef RTASM_HAVE_WRITE_U128
4223
/**
 * Atomically writes an unsigned 128-bit value, ordered.
 *
 * With GCC-style compilers on ARM64 the value is stored by inline assembly
 * (a store-pair bracketed by RTASM_ARM_DMB_SY and "dmb sy").  All other
 * configurations take a snapshot of the current value and loop on
 * ASMAtomicCmpXchgU128v2() until the exchange succeeds.
 *
 * @param   pu128   Pointer to the variable to overwrite.  Must be aligned
 *                  on 16 byte boundary.
 * @param   u64Hi   The high 64 bits of the new value.
 * @param   u64Lo   The low 64 bits of the new value.
 */
DECLINLINE(void) ASMAtomicWriteU128v2(volatile uint128_t *pu128, const uint64_t u64Hi, const uint64_t u64Lo) RT_NOTHROW_DEF
{
    /* The scratch copy of the old value is only needed by the cmpxchg loop. */
# if !defined(__GNUC__) || !defined(RT_ARCH_ARM64)
    RTUINT128U u128Old;
# endif
    Assert(!((uintptr_t)pu128 & 15));
# if defined(__GNUC__) && defined(RT_ARCH_ARM64)
    __asm__ __volatile__("Lstart_ASMAtomicWriteU128v2_%=:\n\t"
#  if 0 && defined(RTASM_ARM64_USE_FEAT_LSE128) /** @todo hw support? test + debug */
                         RTASM_ARM_DMB_SY
                         "swpp %[uValueLo], %[uValueHi], %[pMem]\n\t"
#  else
                         RTASM_ARM_DMB_SY
                         "stp %[uValueLo], %[uValueHi], %[pMem]\n\t"
                         "dmb sy\n\t"
#  endif
                         : [pMem] "+Q" (*pu128)
                         : [uValueHi] "r" (u64Hi)
                         , [uValueLo] "r" (u64Lo)
                         : );

# else
    /* Seed the expected value; a failed exchange refreshes u128Old for the next try. */
#  ifdef RT_COMPILER_WITH_128BIT_INT_TYPES
    u128Old.u = *pu128;
#  else
    u128Old.u.Lo = pu128->Lo;
    u128Old.u.Hi = pu128->Hi;
#  endif
    while (!ASMAtomicCmpXchgU128v2(pu128, u64Hi, u64Lo, u128Old.s.Hi, u128Old.s.Lo, &u128Old.u))
    { }
# endif
}
4264
4265
/**
 * Atomically writes an unsigned 128-bit value, unordered.
 *
 * With GCC-style compilers on ARM64 the value is stored by a single
 * store-pair instruction without any barrier.  All other configurations take
 * a snapshot of the current value and loop on ASMAtomicCmpXchgU128v2() until
 * the exchange succeeds.
 *
 * @param   pu128   Pointer to the variable to overwrite.  Must be aligned
 *                  on 16 byte boundary.
 * @param   u64Hi   The high 64 bits of the new value.
 * @param   u64Lo   The low 64 bits of the new value.
 * @note    This is ordered on AMD64.
 */
DECLINLINE(void) ASMAtomicUoWriteU128v2(volatile uint128_t *pu128, const uint64_t u64Hi, const uint64_t u64Lo) RT_NOTHROW_DEF
{
    /* The scratch copy of the old value is only needed by the cmpxchg loop. */
# if !defined(__GNUC__) || !defined(RT_ARCH_ARM64)
    RTUINT128U u128Old;
# endif
    Assert(!((uintptr_t)pu128 & 15));
# if defined(__GNUC__) && defined(RT_ARCH_ARM64)
    __asm__ __volatile__("Lstart_ASMAtomicUoWriteU128v2_%=:\n\t"
                         "stp %[uValueLo], %[uValueHi], %[pMem]\n\t"
                         : [pMem] "+Q" (*pu128)
                         : [uValueHi] "r" (u64Hi)
                         , [uValueLo] "r" (u64Lo)
                         : );

# else
    /* Seed the expected value; a failed exchange refreshes u128Old for the next try. */
#  ifdef RT_COMPILER_WITH_128BIT_INT_TYPES
    u128Old.u = *pu128;
#  else
    u128Old.u.Lo = pu128->Lo;
    u128Old.u.Hi = pu128->Hi;
#  endif
    while (!ASMAtomicCmpXchgU128v2(pu128, u64Hi, u64Lo, u128Old.s.Hi, u128Old.s.Lo, &u128Old.u))
    { }
# endif
}
4300
4301
4302/**
4303 * Atomically writes an unsigned 128-bit value, ordered.
4304 *
4305 * @param pu128 Pointer to the variable to overwrite. Must be aligned
4306 * on 16 byte boundrary.
4307 * @param u128 The the new value.
4308 */
4309DECLINLINE(void) ASMAtomicWriteU128(volatile uint128_t *pu128, const uint128_t u128) RT_NOTHROW_DEF
4310{
4311# ifdef RT_COMPILER_WITH_128BIT_INT_TYPES
4312 ASMAtomicWriteU128v2(pu128, (uint64_t)(u128 >> 64), (uint64_t)u128);
4313# else
4314 ASMAtomicWriteU128v2(pu128, u128.Hi, u128.Lo);
4315# endif
4316}
4317
4318
4319/**
4320 * Atomically writes an unsigned 128-bit value, unordered.
4321 *
4322 * @param pu128 Pointer to the variable to overwrite. Must be aligned
4323 * on 16 byte boundrary.
4324 * @param u128 The the new value.
4325 * @note This is ordered on AMD64.
4326 */
4327DECLINLINE(void) ASMAtomicUoWriteU128(volatile uint128_t *pu128, const uint128_t u128) RT_NOTHROW_DEF
4328{
4329# ifdef RT_COMPILER_WITH_128BIT_INT_TYPES
4330 ASMAtomicUoWriteU128v2(pu128, (uint64_t)(u128 >> 64), (uint64_t)u128);
4331# else
4332 ASMAtomicUoWriteU128v2(pu128, u128.Hi, u128.Lo);
4333# endif
4334}
4335
4336
/**
 * Atomically writes an unsigned 128-bit value given as a RTUINT128U, ordered.
 *
 * Thin wrapper passing the union's 64-bit halves to ASMAtomicWriteU128v2().
 *
 * @param   pu128   Pointer to the variable to overwrite.  Must be aligned
 *                  on 16 byte boundary.
 * @param   u128    The new value.
 */
DECLINLINE(void) ASMAtomicWriteU128U(volatile RTUINT128U *pu128, const RTUINT128U u128) RT_NOTHROW_DEF
{
    ASMAtomicWriteU128v2(&pu128->u, u128.s.Hi, u128.s.Lo);
}
4348
4349
/**
 * Atomically writes an unsigned 128-bit value given as a RTUINT128U, unordered.
 *
 * Thin wrapper passing the union's 64-bit halves to ASMAtomicUoWriteU128v2().
 *
 * @param   pu128   Pointer to the variable to overwrite.  Must be aligned
 *                  on 16 byte boundary.
 * @param   u128    The new value.
 * @note    This is ordered on AMD64.
 */
DECLINLINE(void) ASMAtomicUoWriteU128U(volatile RTUINT128U *pu128, const RTUINT128U u128) RT_NOTHROW_DEF
{
    ASMAtomicUoWriteU128v2(&pu128->u, u128.s.Hi, u128.s.Lo);
}
4362
4363#endif /* RTASM_HAVE_WRITE_U128 */
4364
4365/**
4366 * Atomically writes a size_t value, ordered.
4367 *
4368 * @param pcb Pointer to the size_t variable to write.
4369 * @param cb The value to assign to *pcb.
4370 */
4371DECLINLINE(void) ASMAtomicWriteZ(volatile size_t RT_FAR *pcb, size_t cb) RT_NOTHROW_DEF
4372{
4373#if ARCH_BITS == 64
4374 ASMAtomicWriteU64((uint64_t volatile *)pcb, cb);
4375#elif ARCH_BITS == 32
4376 ASMAtomicWriteU32((uint32_t volatile *)pcb, cb);
4377#elif ARCH_BITS == 16
4378 AssertCompileSize(size_t, 2);
4379 ASMAtomicWriteU16((uint16_t volatile *)pcb, cb);
4380#else
4381# error "Unsupported ARCH_BITS value"
4382#endif
4383}
4384
4385
4386/**
4387 * Atomically writes a size_t value, unordered.
4388 *
4389 * @param pcb Pointer to the size_t variable to write.
4390 * @param cb The value to assign to *pcb.
4391 */
4392DECLINLINE(void) ASMAtomicUoWriteZ(volatile size_t RT_FAR *pcb, size_t cb) RT_NOTHROW_DEF
4393{
4394#if ARCH_BITS == 64
4395 ASMAtomicUoWriteU64((uint64_t volatile *)pcb, cb);
4396#elif ARCH_BITS == 32
4397 ASMAtomicUoWriteU32((uint32_t volatile *)pcb, cb);
4398#elif ARCH_BITS == 16
4399 AssertCompileSize(size_t, 2);
4400 ASMAtomicUoWriteU16((uint16_t volatile *)pcb, cb);
4401#else
4402# error "Unsupported ARCH_BITS value"
4403#endif
4404}
4405
4406
/**
 * Atomically writes a boolean value, ordered.
 *
 * Forwards to ASMAtomicWriteU8(), treating the bool as an 8-bit value.
 *
 * @param   pf      Pointer to the boolean variable to write.
 * @param   f       The boolean value to assign to *pf.
 */
DECLINLINE(void) ASMAtomicWriteBool(volatile bool RT_FAR *pf, bool f) RT_NOTHROW_DEF
{
    ASMAtomicWriteU8((uint8_t volatile RT_FAR *)pf, f);
}
4417
4418
/**
 * Atomically writes a boolean value, unordered.
 *
 * Implemented as a single plain store through the volatile pointer.
 *
 * @param   pf      Pointer to the boolean variable to write.
 * @param   f       The boolean value to assign to *pf.
 */
DECLINLINE(void) ASMAtomicUoWriteBool(volatile bool RT_FAR *pf, bool f) RT_NOTHROW_DEF
{
    *pf = f; /* byte writes are atomic on x86 */
}
4429
4430
4431/**
4432 * Atomically writes a pointer value, ordered.
4433 *
4434 * @param ppv Pointer to the pointer variable to write.
4435 * @param pv The pointer value to assign to *ppv.
4436 */
4437DECLINLINE(void) ASMAtomicWritePtrVoid(void RT_FAR * volatile RT_FAR *ppv, const void *pv) RT_NOTHROW_DEF
4438{
4439#if ARCH_BITS == 32 || ARCH_BITS == 16
4440 ASMAtomicWriteU32((volatile uint32_t RT_FAR *)(void RT_FAR *)ppv, (uint32_t)pv);
4441#elif ARCH_BITS == 64
4442 ASMAtomicWriteU64((volatile uint64_t RT_FAR *)(void RT_FAR *)ppv, (uint64_t)pv);
4443#else
4444# error "ARCH_BITS is bogus"
4445#endif
4446}
4447
4448
4449/**
4450 * Atomically writes a pointer value, unordered.
4451 *
4452 * @param ppv Pointer to the pointer variable to write.
4453 * @param pv The pointer value to assign to *ppv.
4454 */
4455DECLINLINE(void) ASMAtomicUoWritePtrVoid(void RT_FAR * volatile RT_FAR *ppv, const void *pv) RT_NOTHROW_DEF
4456{
4457#if ARCH_BITS == 32 || ARCH_BITS == 16
4458 ASMAtomicUoWriteU32((volatile uint32_t RT_FAR *)(void RT_FAR *)ppv, (uint32_t)pv);
4459#elif ARCH_BITS == 64
4460 ASMAtomicUoWriteU64((volatile uint64_t RT_FAR *)(void RT_FAR *)ppv, (uint64_t)pv);
4461#else
4462# error "ARCH_BITS is bogus"
4463#endif
4464}
4465
4466
/**
 * Atomically writes a pointer value, ordered.
 *
 * Checks at compile time that both the destination and the value are pointer
 * sized, asserts that the destination is naturally aligned, and then forwards
 * to ASMAtomicWritePtrVoid().  The macro arguments are fully parenthesized so
 * that expression arguments (pointer arithmetic, conditionals) are handled
 * correctly.
 *
 * @param   ppv     Pointer to the pointer variable to write.
 * @param   pv      The pointer value to assign to *ppv. If NULL use
 *                  ASMAtomicWriteNullPtr or you'll land in trouble.
 *
 * @remarks This is relatively type safe on GCC platforms when @a pv isn't
 *          NULL.
 * @remarks @a ppv is evaluated more than once; avoid arguments with side
 *          effects.
 */
#ifdef __GNUC__
# define ASMAtomicWritePtr(ppv, pv) \
    do \
    { \
        __typeof__(*(ppv)) volatile RT_FAR * const ppvTypeChecked = (ppv); \
        __typeof__(*(ppv)) const pvTypeChecked = (pv); \
        \
        AssertCompile(sizeof(*(ppv)) == sizeof(void RT_FAR *)); \
        AssertCompile(sizeof(pv) == sizeof(void RT_FAR *)); \
        Assert(!( (uintptr_t)(ppv) & ((ARCH_BITS / 8) - 1) )); \
        \
        ASMAtomicWritePtrVoid((void RT_FAR * volatile RT_FAR *)(ppvTypeChecked), (void RT_FAR *)(pvTypeChecked)); \
    } while (0)
#else
# define ASMAtomicWritePtr(ppv, pv) \
    do \
    { \
        AssertCompile(sizeof(*(ppv)) == sizeof(void RT_FAR *)); \
        AssertCompile(sizeof(pv) == sizeof(void RT_FAR *)); \
        Assert(!( (uintptr_t)(ppv) & ((ARCH_BITS / 8) - 1) )); \
        \
        ASMAtomicWritePtrVoid((void RT_FAR * volatile RT_FAR *)(ppv), (void RT_FAR *)(pv)); \
    } while (0)
#endif
4501
4502
4503/**
4504 * Atomically sets a pointer to NULL, ordered.
4505 *
4506 * @param ppv Pointer to the pointer variable that should be set to NULL.
4507 *
4508 * @remarks This is relatively type safe on GCC platforms.
4509 */
4510#if RT_GNUC_PREREQ(4, 2)
4511# define ASMAtomicWriteNullPtr(ppv) \
4512 do \
4513 { \
4514 __typeof__(*(ppv)) * const ppvTypeChecked = (ppv); \
4515 AssertCompile(sizeof(*ppv) == sizeof(void RT_FAR *)); \
4516 Assert(!( (uintptr_t)ppv & ((ARCH_BITS / 8) - 1) )); \
4517 ASMAtomicWritePtrVoid((void RT_FAR * volatile RT_FAR *)(ppvTypeChecked), NULL); \
4518 } while (0)
4519#else
4520# define ASMAtomicWriteNullPtr(ppv) \
4521 do \
4522 { \
4523 AssertCompile(sizeof(*ppv) == sizeof(void RT_FAR *)); \
4524 Assert(!( (uintptr_t)ppv & ((ARCH_BITS / 8) - 1) )); \
4525 ASMAtomicWritePtrVoid((void RT_FAR * volatile RT_FAR *)(ppv), NULL); \
4526 } while (0)
4527#endif
4528
4529
4530/**
4531 * Atomically writes a pointer value, unordered.
4532 *
4533 * @returns Current *pv value
4534 * @param ppv Pointer to the pointer variable.
4535 * @param pv The pointer value to assign to *ppv. If NULL use
4536 * ASMAtomicUoWriteNullPtr or you'll land in trouble.
4537 *
4538 * @remarks This is relatively type safe on GCC platforms when @a pv isn't
4539 * NULL.
4540 */
4541#if RT_GNUC_PREREQ(4, 2)
4542# define ASMAtomicUoWritePtr(ppv, pv) \
4543 do \
4544 { \
4545 __typeof__(*(ppv)) volatile * const ppvTypeChecked = (ppv); \
4546 __typeof__(*(ppv)) const pvTypeChecked = (pv); \
4547 \
4548 AssertCompile(sizeof(*ppv) == sizeof(void *)); \
4549 AssertCompile(sizeof(pv) == sizeof(void *)); \
4550 Assert(!( (uintptr_t)ppv & ((ARCH_BITS / 8) - 1) )); \
4551 \
4552 *(ppvTypeChecked) = pvTypeChecked; \
4553 } while (0)
4554#else
4555# define ASMAtomicUoWritePtr(ppv, pv) \
4556 do \
4557 { \
4558 AssertCompile(sizeof(*ppv) == sizeof(void RT_FAR *)); \
4559 AssertCompile(sizeof(pv) == sizeof(void RT_FAR *)); \
4560 Assert(!( (uintptr_t)ppv & ((ARCH_BITS / 8) - 1) )); \
4561 *(ppv) = pv; \
4562 } while (0)
4563#endif
4564
4565
/**
 * Atomically sets a pointer to NULL, unordered.
 *
 * Checks at compile time that the destination is pointer sized, asserts that
 * it is naturally aligned, and then assigns NULL with a plain store.  The
 * macro argument is fully parenthesized so that expression arguments are
 * handled correctly.
 *
 * @param   ppv     Pointer to the pointer variable that should be set to NULL.
 *
 * @remarks This is relatively type safe on GCC platforms.
 * @remarks @a ppv is evaluated more than once; avoid arguments with side
 *          effects.
 */
#ifdef __GNUC__
# define ASMAtomicUoWriteNullPtr(ppv) \
    do \
    { \
        __typeof__(*(ppv)) volatile * const ppvTypeChecked = (ppv); \
        AssertCompile(sizeof(*(ppv)) == sizeof(void *)); \
        Assert(!( (uintptr_t)(ppv) & ((ARCH_BITS / 8) - 1) )); \
        *(ppvTypeChecked) = NULL; \
    } while (0)
#else
# define ASMAtomicUoWriteNullPtr(ppv) \
    do \
    { \
        AssertCompile(sizeof(*(ppv)) == sizeof(void RT_FAR *)); \
        Assert(!( (uintptr_t)(ppv) & ((ARCH_BITS / 8) - 1) )); \
        *(ppv) = NULL; \
    } while (0)
#endif
4591
4592
/**
 * Atomically write a typical IPRT handle value, ordered.
 *
 * The handle is written as a 32-bit integer when HC_ARCH_BITS is 32 or
 * ARCH_BITS is 16, and as a 64-bit integer when HC_ARCH_BITS is 64.  A
 * compile time assertion checks that *ph has the matching size.
 *
 * @param   ph      Pointer to the variable to update.
 * @param   hNew    The value to assign to *ph.
 *
 * @remarks This doesn't currently work for all handles (like RTFILE).
 */
#if HC_ARCH_BITS == 32 || ARCH_BITS == 16
# define ASMAtomicWriteHandle(ph, hNew) \
    do { \
        AssertCompile(sizeof(*(ph)) == sizeof(uint32_t)); \
        ASMAtomicWriteU32((uint32_t volatile RT_FAR *)(ph), (const uint32_t)(hNew)); \
    } while (0)
#elif HC_ARCH_BITS == 64
# define ASMAtomicWriteHandle(ph, hNew) \
    do { \
        AssertCompile(sizeof(*(ph)) == sizeof(uint64_t)); \
        ASMAtomicWriteU64((uint64_t volatile RT_FAR *)(ph), (const uint64_t)(hNew)); \
    } while (0)
#else
# error HC_ARCH_BITS
#endif
4616
4617
4618/**
4619 * Atomically write a typical IPRT handle value, unordered.
4620 *
4621 * @param ph Pointer to the variable to update.
4622 * @param hNew The value to assign to *ph.
4623 *
4624 * @remarks This doesn't currently work for all handles (like RTFILE).
4625 */
4626#if HC_ARCH_BITS == 32 || ARCH_BITS == 16
4627# define ASMAtomicUoWriteHandle(ph, hNew) \
4628 do { \
4629 AssertCompile(sizeof(*(ph)) == sizeof(uint32_t)); \
4630 ASMAtomicUoWriteU32((uint32_t volatile RT_FAR *)(ph), (const uint32_t)hNew); \
4631 } while (0)
4632#elif HC_ARCH_BITS == 64
4633# define ASMAtomicUoWriteHandle(ph, hNew) \
4634 do { \
4635 AssertCompile(sizeof(*(ph)) == sizeof(uint64_t)); \
4636 ASMAtomicUoWriteU64((uint64_t volatile RT_FAR *)(ph), (const uint64_t)hNew); \
4637 } while (0)
4638#else
4639# error HC_ARCH_BITS
4640#endif
4641
4642
/**
 * Atomically writes a value whose size might differ between platforms or
 * compilers, ordered.
 *
 * Expands to a switch on sizeof(*(pu)) that forwards to ASMAtomicWriteU8,
 * ASMAtomicWriteU16, ASMAtomicWriteU32 or ASMAtomicWriteU64.  Any other size
 * ends up in AssertMsgFailed and nothing is written.
 *
 * The size is cast to int for the message because sizeof yields a size_t,
 * which does not match the %d conversion.
 *
 * @param   pu      Pointer to the variable to update.
 * @param   uNew    The value to assign to *pu.
 */
#define ASMAtomicWriteSize(pu, uNew) \
    do { \
        switch (sizeof(*(pu))) { \
            case 1: ASMAtomicWriteU8( (volatile uint8_t  RT_FAR *)(void RT_FAR *)(pu), (uint8_t )(uNew)); break; \
            case 2: ASMAtomicWriteU16((volatile uint16_t RT_FAR *)(void RT_FAR *)(pu), (uint16_t)(uNew)); break; \
            case 4: ASMAtomicWriteU32((volatile uint32_t RT_FAR *)(void RT_FAR *)(pu), (uint32_t)(uNew)); break; \
            case 8: ASMAtomicWriteU64((volatile uint64_t RT_FAR *)(void RT_FAR *)(pu), (uint64_t)(uNew)); break; \
            default: AssertMsgFailed(("ASMAtomicWriteSize: size %d is not supported\n", (int)sizeof(*(pu)))); \
        } \
    } while (0)
4660
/**
 * Atomically writes a value whose size might differ between platforms or
 * compilers, unordered.
 *
 * Expands to a switch on sizeof(*(pu)) that forwards to ASMAtomicUoWriteU8,
 * ASMAtomicUoWriteU16, ASMAtomicUoWriteU32 or ASMAtomicUoWriteU64.  Any other
 * size ends up in AssertMsgFailed and nothing is written.
 *
 * The failure message names this macro (it used to name the ordered variant)
 * and the size is cast to int to match the %d conversion.
 *
 * @param   pu      Pointer to the variable to update.
 * @param   uNew    The value to assign to *pu.
 */
#define ASMAtomicUoWriteSize(pu, uNew) \
    do { \
        switch (sizeof(*(pu))) { \
            case 1: ASMAtomicUoWriteU8( (volatile uint8_t  RT_FAR *)(void RT_FAR *)(pu), (uint8_t )(uNew)); break; \
            case 2: ASMAtomicUoWriteU16((volatile uint16_t RT_FAR *)(void RT_FAR *)(pu), (uint16_t)(uNew)); break; \
            case 4: ASMAtomicUoWriteU32((volatile uint32_t RT_FAR *)(void RT_FAR *)(pu), (uint32_t)(uNew)); break; \
            case 8: ASMAtomicUoWriteU64((volatile uint64_t RT_FAR *)(void RT_FAR *)(pu), (uint64_t)(uNew)); break; \
            default: AssertMsgFailed(("ASMAtomicUoWriteSize: size %d is not supported\n", (int)sizeof(*(pu)))); \
        } \
    } while (0)
4678
4679
4680
/**
 * Atomically exchanges and adds to a 16-bit value, ordered.
 *
 * Only a prototype is provided here; the implementation lives elsewhere.
 *
 * @returns The old value.
 * @param   pu16        Pointer to the value.
 * @param   u16         Number to add.  NOTE(review): declared as uint32_t
 *                      although the name suggests a 16-bit quantity - confirm
 *                      against the implementation before changing.
 *
 * @remarks Currently not implemented, just to make 16-bit code happy.
 * @remarks x86: Requires a 486 or later.
 */
RT_ASM_DECL_PRAGMA_WATCOM(uint16_t) ASMAtomicAddU16(uint16_t volatile RT_FAR *pu16, uint32_t u16) RT_NOTHROW_PROTO;
4692
4693
/**
 * Atomically exchanges and adds to a 32-bit value, ordered.
 *
 * Depending on the build configuration this is either an external assembly
 * routine (prototype only), a compiler intrinsic, an x86/AMD64 "lock xadd"
 * sequence, or an ARM implementation (LSE "ldadd"/"ldaddal" or the generic
 * load-modify-store helper macro).
 *
 * @returns The old value.
 * @param   pu32        Pointer to the value.
 * @param   u32         Number to add.
 *
 * @remarks x86: Requires a 486 or later.
 */
#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
RT_ASM_DECL_PRAGMA_WATCOM(uint32_t) ASMAtomicAddU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_PROTO;
#else
DECLINLINE(uint32_t) ASMAtomicAddU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_DEF
{
# if RT_INLINE_ASM_USES_INTRIN
    /* The intrinsic hands back the previous value, which is reused as the return value. */
    u32 = _InterlockedExchangeAdd((long RT_FAR *)pu32, u32);
    return u32;

# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
#  if RT_INLINE_ASM_GNU_STYLE
    /* xadd leaves the old memory value in the register operand, i.e. in u32. */
    __asm__ __volatile__("lock; xaddl %0, %1\n\t"
                         : "=r" (u32)
                         , "=m" (*pu32)
                         : "0" (u32)
                         , "m" (*pu32)
                         : "memory"
                         , "cc");
    return u32;
#  else
    __asm
    {
        mov     eax, [u32]
#   ifdef RT_ARCH_AMD64
        mov     rdx, [pu32]
        lock xadd [rdx], eax
#   else
        mov     edx, [pu32]
        lock xadd [edx], eax
#   endif
        mov     [u32], eax
    }
    return u32;
#  endif

# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
    /* M1 benchmark: ldaddal=6907 vs dmb+ldadd=2114 vs non-lse=6249 (ps/call) */
#  if defined(RTASM_ARM64_USE_FEAT_LSE)
    uint32_t u32OldRet;
    __asm__ __volatile__("Lstart_ASMAtomicAddU32_%=:\n\t"
#   if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
                         "ldaddal %w[uAddend], %w[uOldActual], %[pMem]\n\t"
#   else
                         RTASM_ARM_DMB_SY
                         "ldadd %w[uAddend], %w[uOldActual], %[pMem]\n\t"
#   endif
                         : [pMem] "+Q" (*pu32)
                         , [uOldActual] "=&r" (u32OldRet)
                         : [uAddend] "r" (u32)
                         : );
#  else
    /* NOTE(review): u32OldRet is not declared in this branch; presumably the
       helper macro below declares it - confirm against its definition. */
    RTASM_ARM_LOAD_MODIFY_STORE_RET_OLD_32(ASMAtomicAddU32, pu32, DMB_SY,
                                           "add %w[uNew], %w[uOld], %w[uVal]\n\t",
                                           "add %[uNew], %[uOld], %[uVal]\n\t",
                                           [uVal] "r" (u32));
#  endif
    return u32OldRet;

# else
#  error "Port me"
# endif
}
#endif
4766
4767
4768/**
4769 * Atomically exchanges and adds to a signed 32-bit value, ordered.
4770 *
4771 * @returns The old value.
4772 * @param pi32 Pointer to the value.
4773 * @param i32 Number to add.
4774 *
4775 * @remarks x86: Requires a 486 or later.
4776 */
4777DECLINLINE(int32_t) ASMAtomicAddS32(int32_t volatile RT_FAR *pi32, int32_t i32) RT_NOTHROW_DEF
4778{
4779 return (int32_t)ASMAtomicAddU32((uint32_t volatile RT_FAR *)pi32, (uint32_t)i32);
4780}
4781
4782
/**
 * Atomically exchanges and adds to a 64-bit value, ordered.
 *
 * Depending on the build configuration this is either an external assembly
 * routine (prototype only), a compiler intrinsic or "lock xaddq" on AMD64, an
 * ARM implementation (LSE "ldadd"/"ldaddal" or the generic load-modify-store
 * helper macro), or a compare-and-exchange retry loop.
 *
 * @returns The old value.
 * @param   pu64        Pointer to the value.
 * @param   u64         Number to add.
 *
 * @remarks x86: Requires a Pentium or later.
 */
#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
DECLASM(uint64_t) ASMAtomicAddU64(uint64_t volatile RT_FAR *pu64, uint64_t u64) RT_NOTHROW_PROTO;
#else
DECLINLINE(uint64_t) ASMAtomicAddU64(uint64_t volatile RT_FAR *pu64, uint64_t u64) RT_NOTHROW_DEF
{
# if RT_INLINE_ASM_USES_INTRIN && defined(RT_ARCH_AMD64)
    /* The intrinsic hands back the previous value, which is reused as the return value. */
    u64 = _InterlockedExchangeAdd64((__int64 RT_FAR *)pu64, u64);
    return u64;

# elif RT_INLINE_ASM_GNU_STYLE && defined(RT_ARCH_AMD64)
    /* xadd leaves the old memory value in the register operand, i.e. in u64. */
    __asm__ __volatile__("lock; xaddq %0, %1\n\t"
                         : "=r" (u64)
                         , "=m" (*pu64)
                         : "0" (u64)
                         , "m" (*pu64)
                         : "memory"
                         , "cc");
    return u64;

# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
#  if defined(RTASM_ARM64_USE_FEAT_LSE)
    uint64_t u64OldRet;
    __asm__ __volatile__("Lstart_ASMAtomicAddU64_%=:\n\t"
#   if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
                         "ldaddal %[uAddend], %[uOldActual], %[pMem]\n\t"
#   else
                         RTASM_ARM_DMB_SY
                         "ldadd %[uAddend], %[uOldActual], %[pMem]\n\t"
#   endif
                         : [pMem] "+Q" (*pu64)
                         , [uOldActual] "=&r" (u64OldRet)
                         : [uAddend] "r" (u64)
                         : );
#  else
    /* NOTE(review): u64OldRet is not declared in this branch; presumably the
       helper macro below declares it - confirm against its definition. */
    RTASM_ARM_LOAD_MODIFY_STORE_RET_OLD_64(ASMAtomicAddU64, pu64, DMB_SY,
                                           "add %[uNew], %[uOld], %[uVal]\n\t"
                                           ,
                                           "add %[uNew], %[uOld], %[uVal]\n\t"
                                           "adc %H[uNew], %H[uOld], %H[uVal]\n\t",
                                           [uVal] "r" (u64));
#  endif
    return u64OldRet;

# else
    /* Generic fallback: retry a compare-and-exchange until no other writer
       changed the value between the read and the exchange. */
    uint64_t u64Old;
    for (;;)
    {
        uint64_t u64New;
        u64Old = ASMAtomicUoReadU64(pu64);
        u64New = u64Old + u64;
        if (ASMAtomicCmpXchgU64(pu64, u64New, u64Old))
            break;
        ASMNopPause();
    }
    return u64Old;
# endif
}
#endif
4850
4851
4852/**
4853 * Atomically exchanges and adds to a signed 64-bit value, ordered.
4854 *
4855 * @returns The old value.
4856 * @param pi64 Pointer to the value.
4857 * @param i64 Number to add.
4858 *
4859 * @remarks x86: Requires a Pentium or later.
4860 */
4861DECLINLINE(int64_t) ASMAtomicAddS64(int64_t volatile RT_FAR *pi64, int64_t i64) RT_NOTHROW_DEF
4862{
4863 return (int64_t)ASMAtomicAddU64((uint64_t volatile RT_FAR *)pi64, (uint64_t)i64);
4864}
4865
4866
4867/**
4868 * Atomically exchanges and adds to a size_t value, ordered.
4869 *
4870 * @returns The old value.
4871 * @param pcb Pointer to the size_t value.
4872 * @param cb Number to add.
4873 */
4874DECLINLINE(size_t) ASMAtomicAddZ(size_t volatile RT_FAR *pcb, size_t cb) RT_NOTHROW_DEF
4875{
4876#if ARCH_BITS == 64
4877 AssertCompileSize(size_t, 8);
4878 return ASMAtomicAddU64((uint64_t volatile RT_FAR *)pcb, cb);
4879#elif ARCH_BITS == 32
4880 AssertCompileSize(size_t, 4);
4881 return ASMAtomicAddU32((uint32_t volatile RT_FAR *)pcb, cb);
4882#elif ARCH_BITS == 16
4883 AssertCompileSize(size_t, 2);
4884 return ASMAtomicAddU16((uint16_t volatile RT_FAR *)pcb, cb);
4885#else
4886# error "Unsupported ARCH_BITS value"
4887#endif
4888}
4889
4890
/**
 * Atomically exchanges and adds a value whose size might differ between
 * platforms or compilers, ordered.
 *
 * Expands to a switch on sizeof(*(pu)) that forwards to ASMAtomicAddU32 or
 * ASMAtomicAddU64.  Only 4 and 8 byte operands are handled; any other size
 * ends up in AssertMsgFailed and neither *pu nor *puOld is touched.
 *
 * The size is cast to int for the message because sizeof yields a size_t,
 * which does not match the %d conversion.
 *
 * @param   pu      Pointer to the variable to update.
 * @param   uNew    The value to add to *pu.
 * @param   puOld   Where to store the old value.
 */
#define ASMAtomicAddSize(pu, uNew, puOld) \
    do { \
        switch (sizeof(*(pu))) { \
            case 4: *(uint32_t *)(puOld) = ASMAtomicAddU32((volatile uint32_t RT_FAR *)(void RT_FAR *)(pu), (uint32_t)(uNew)); break; \
            case 8: *(uint64_t *)(puOld) = ASMAtomicAddU64((volatile uint64_t RT_FAR *)(void RT_FAR *)(pu), (uint64_t)(uNew)); break; \
            default: AssertMsgFailed(("ASMAtomicAddSize: size %d is not supported\n", (int)sizeof(*(pu)))); \
        } \
    } while (0)
4907
4908
4909
4910/**
4911 * Atomically exchanges and subtracts to an unsigned 16-bit value, ordered.
4912 *
4913 * @returns The old value.
4914 * @param pu16 Pointer to the value.
4915 * @param u16 Number to subtract.
4916 *
4917 * @remarks x86: Requires a 486 or later.
4918 */
4919DECLINLINE(uint16_t) ASMAtomicSubU16(uint16_t volatile RT_FAR *pu16, uint32_t u16) RT_NOTHROW_DEF
4920{
4921 return ASMAtomicAddU16(pu16, (uint16_t)-(int16_t)u16);
4922}
4923
4924
4925/**
4926 * Atomically exchanges and subtracts to a signed 16-bit value, ordered.
4927 *
4928 * @returns The old value.
4929 * @param pi16 Pointer to the value.
4930 * @param i16 Number to subtract.
4931 *
4932 * @remarks x86: Requires a 486 or later.
4933 */
4934DECLINLINE(int16_t) ASMAtomicSubS16(int16_t volatile RT_FAR *pi16, int16_t i16) RT_NOTHROW_DEF
4935{
4936 return (int16_t)ASMAtomicAddU16((uint16_t volatile RT_FAR *)pi16, (uint16_t)-i16);
4937}
4938
4939
4940/**
4941 * Atomically exchanges and subtracts to an unsigned 32-bit value, ordered.
4942 *
4943 * @returns The old value.
4944 * @param pu32 Pointer to the value.
4945 * @param u32 Number to subtract.
4946 *
4947 * @remarks x86: Requires a 486 or later.
4948 */
4949DECLINLINE(uint32_t) ASMAtomicSubU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_DEF
4950{
4951 return ASMAtomicAddU32(pu32, (uint32_t)-(int32_t)u32);
4952}
4953
4954
4955/**
4956 * Atomically exchanges and subtracts to a signed 32-bit value, ordered.
4957 *
4958 * @returns The old value.
4959 * @param pi32 Pointer to the value.
4960 * @param i32 Number to subtract.
4961 *
4962 * @remarks x86: Requires a 486 or later.
4963 */
4964DECLINLINE(int32_t) ASMAtomicSubS32(int32_t volatile RT_FAR *pi32, int32_t i32) RT_NOTHROW_DEF
4965{
4966 return (int32_t)ASMAtomicAddU32((uint32_t volatile RT_FAR *)pi32, (uint32_t)-i32);
4967}
4968
4969
4970/**
4971 * Atomically exchanges and subtracts to an unsigned 64-bit value, ordered.
4972 *
4973 * @returns The old value.
4974 * @param pu64 Pointer to the value.
4975 * @param u64 Number to subtract.
4976 *
4977 * @remarks x86: Requires a Pentium or later.
4978 */
4979DECLINLINE(uint64_t) ASMAtomicSubU64(uint64_t volatile RT_FAR *pu64, uint64_t u64) RT_NOTHROW_DEF
4980{
4981 return ASMAtomicAddU64(pu64, (uint64_t)-(int64_t)u64);
4982}
4983
4984
4985/**
4986 * Atomically exchanges and subtracts to a signed 64-bit value, ordered.
4987 *
4988 * @returns The old value.
4989 * @param pi64 Pointer to the value.
4990 * @param i64 Number to subtract.
4991 *
4992 * @remarks x86: Requires a Pentium or later.
4993 */
4994DECLINLINE(int64_t) ASMAtomicSubS64(int64_t volatile RT_FAR *pi64, int64_t i64) RT_NOTHROW_DEF
4995{
4996 return (int64_t)ASMAtomicAddU64((uint64_t volatile RT_FAR *)pi64, (uint64_t)-i64);
4997}
4998
4999
5000/**
5001 * Atomically exchanges and subtracts to a size_t value, ordered.
5002 *
5003 * @returns The old value.
5004 * @param pcb Pointer to the size_t value.
5005 * @param cb Number to subtract.
5006 *
5007 * @remarks x86: Requires a 486 or later.
5008 */
5009DECLINLINE(size_t) ASMAtomicSubZ(size_t volatile RT_FAR *pcb, size_t cb) RT_NOTHROW_DEF
5010{
5011#if ARCH_BITS == 64
5012 return ASMAtomicSubU64((uint64_t volatile RT_FAR *)pcb, cb);
5013#elif ARCH_BITS == 32
5014 return ASMAtomicSubU32((uint32_t volatile RT_FAR *)pcb, cb);
5015#elif ARCH_BITS == 16
5016 AssertCompileSize(size_t, 2);
5017 return ASMAtomicSubU16((uint16_t volatile RT_FAR *)pcb, cb);
5018#else
5019# error "Unsupported ARCH_BITS value"
5020#endif
5021}
5022
5023
/**
 * Atomically exchanges and subtracts a value which size might differ between
 * platforms or compilers, ordered.
 *
 * Dispatches on sizeof(*(pu)) to ASMAtomicSubU32 (4 bytes) or ASMAtomicSubU64
 * (8 bytes).  Any other operand size only triggers the assertion in the
 * default case; nothing is subtracted and *puOld is not written.
 *
 * @param   pu      Pointer to the variable to update.
 * @param   uNew    The value to subtract from *pu.
 * @param   puOld   Where to store the old value.  It is written through a
 *                  uint32_t or uint64_t pointer according to the size of *pu.
 *
 * @remarks x86: Requires a 486 or later.
 */
#define ASMAtomicSubSize(pu, uNew, puOld) \
    do { \
        switch (sizeof(*(pu))) { \
            case 4: *(uint32_t RT_FAR *)(puOld) = ASMAtomicSubU32((volatile uint32_t RT_FAR *)(void RT_FAR *)(pu), (uint32_t)(uNew)); break; \
            case 8: *(uint64_t RT_FAR *)(puOld) = ASMAtomicSubU64((volatile uint64_t RT_FAR *)(void RT_FAR *)(pu), (uint64_t)(uNew)); break; \
            /* sizeof yields size_t; cast it so the argument matches the %d conversion. */ \
            default: AssertMsgFailed(("ASMAtomicSubSize: size %d is not supported\n", (int)sizeof(*(pu)))); \
        } \
    } while (0)
5042
5043
5044
/**
 * Atomically increment a 16-bit value, ordered.
 *
 * Only the prototype is given here; ASMAtomicIncZ refers to it when
 * ARCH_BITS is 16.
 *
 * @returns The new value.
 * @param   pu16    Pointer to the value to increment.
 * @remarks Not implemented. Just to make 16-bit code happy.
 *
 * @remarks x86: Requires a 486 or later.
 */
RT_ASM_DECL_PRAGMA_WATCOM(uint16_t) ASMAtomicIncU16(uint16_t volatile RT_FAR *pu16) RT_NOTHROW_PROTO;
5055
5056
5057/**
5058 * Atomically increment a 32-bit value, ordered.
5059 *
5060 * @returns The new value.
5061 * @param pu32 Pointer to the value to increment.
5062 *
5063 * @remarks x86: Requires a 486 or later.
5064 */
5065#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
5066RT_ASM_DECL_PRAGMA_WATCOM(uint32_t) ASMAtomicIncU32(uint32_t volatile RT_FAR *pu32) RT_NOTHROW_PROTO;
5067#else
5068DECLINLINE(uint32_t) ASMAtomicIncU32(uint32_t volatile RT_FAR *pu32) RT_NOTHROW_DEF
5069{
5070# if RT_INLINE_ASM_USES_INTRIN
5071 return (uint32_t)_InterlockedIncrement((long RT_FAR *)pu32);
5072
5073# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
5074# if RT_INLINE_ASM_GNU_STYLE
5075 uint32_t u32;
5076 __asm__ __volatile__("lock; xaddl %0, %1\n\t"
5077 : "=r" (u32)
5078 , "=m" (*pu32)
5079 : "0" (1)
5080 , "m" (*pu32)
5081 : "memory"
5082 , "cc");
5083 return u32+1;
5084# else
5085 __asm
5086 {
5087 mov eax, 1
5088# ifdef RT_ARCH_AMD64
5089 mov rdx, [pu32]
5090 lock xadd [rdx], eax
5091# else
5092 mov edx, [pu32]
5093 lock xadd [edx], eax
5094# endif
5095 mov u32, eax
5096 }
5097 return u32+1;
5098# endif
5099
5100# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
5101 /* M1 benchmark: ldaddal=6887 vs dmb+ldadd=2117 vs non-lse=6247 (ps/call) */
5102# if defined(RTASM_ARM64_USE_FEAT_LSE)
5103 uint32_t u32NewRet;
5104 __asm__ __volatile__("Lstart_ASMAtomicIncU32_%=:\n\t"
5105# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
5106 "ldaddal %w[uAddend], %w[uNewRet], %[pMem]\n\t"
5107# else
5108 RTASM_ARM_DMB_SY
5109 "ldadd %w[uAddend], %w[uNewRet], %[pMem]\n\t"
5110# endif
5111 "add %w[uNewRet], %w[uNewRet], #1\n\t"
5112 : [pMem] "+Q" (*pu32)
5113 , [uNewRet] "=&r" (u32NewRet)
5114 : [uAddend] "r" ((uint32_t)1)
5115 : );
5116# else
5117 RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_32(ASMAtomicIncU32, pu32, DMB_SY,
5118 "add %w[uNew], %w[uNew], #1\n\t",
5119 "add %[uNew], %[uNew], #1\n\t" /* arm6 / thumb2+ */,
5120 "X" (0) /* dummy */);
5121# endif
5122 return u32NewRet;
5123
5124# else
5125 return ASMAtomicAddU32(pu32, 1) + 1;
5126# endif
5127}
5128#endif
5129
5130
5131/**
5132 * Atomically increment a signed 32-bit value, ordered.
5133 *
5134 * @returns The new value.
5135 * @param pi32 Pointer to the value to increment.
5136 *
5137 * @remarks x86: Requires a 486 or later.
5138 */
5139DECLINLINE(int32_t) ASMAtomicIncS32(int32_t volatile RT_FAR *pi32) RT_NOTHROW_DEF
5140{
5141 return (int32_t)ASMAtomicIncU32((uint32_t volatile RT_FAR *)pi32);
5142}
5143
5144
/**
 * Atomically increment a 64-bit value, ordered.
 *
 * Depending on the build configuration this is either an external assembly
 * routine (prototype only) or one of the inline variants below.  Each inline
 * variant fetches the old value while adding 1 atomically and then adds 1 to
 * the fetched value to produce the return value.
 *
 * @returns The new value.
 * @param   pu64    Pointer to the value to increment.
 *
 * @remarks x86: Requires a Pentium or later.
 */
#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
DECLASM(uint64_t) ASMAtomicIncU64(uint64_t volatile RT_FAR *pu64) RT_NOTHROW_PROTO;
#else
DECLINLINE(uint64_t) ASMAtomicIncU64(uint64_t volatile RT_FAR *pu64) RT_NOTHROW_DEF
{
# if RT_INLINE_ASM_USES_INTRIN && defined(RT_ARCH_AMD64)
    return (uint64_t)_InterlockedIncrement64((__int64 RT_FAR *)pu64);

# elif RT_INLINE_ASM_GNU_STYLE && defined(RT_ARCH_AMD64)
    uint64_t u64;
    /* xadd leaves the previous memory content in the register operand (u64). */
    __asm__ __volatile__("lock; xaddq %0, %1\n\t"
                         : "=r" (u64)
                         , "=m" (*pu64)
                         : "0" (1)
                         , "m" (*pu64)
                         : "memory"
                         , "cc");
    return u64 + 1;

# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
#  if defined(RTASM_ARM64_USE_FEAT_LSE)
    uint64_t u64NewRet;
    /* ldadd/ldaddal loads the old value into uNewRet; the trailing add turns
       it into the new value. */
    __asm__ __volatile__("Lstart_ASMAtomicIncU64_%=:\n\t"
#   if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
                         "ldaddal %[uAddend], %[uNewRet], %[pMem]\n\t"
#   else
                         RTASM_ARM_DMB_SY
                         "ldadd %[uAddend], %[uNewRet], %[pMem]\n\t"
#   endif
                         "add %[uNewRet], %[uNewRet], #1\n\t"
                         : [pMem] "+Q" (*pu64)
                         , [uNewRet] "=&r" (u64NewRet)
                         : [uAddend] "r" ((uint64_t)1)
                         : );
#  else
    /* The macro is defined elsewhere; the return below relies on it providing
       u64NewRet.  First string: 64-bit variant, second: 32-bit variant working
       on a register pair.
       NOTE(review): the 32-bit variant uses a plain "add" followed by "adc";
       confirm that the carry the adc consumes is really produced by that add
       (a flag-setting "adds" may be intended). */
    RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_64(ASMAtomicIncU64, pu64, DMB_SY,
                                           "add %[uNew], %[uNew], #1\n\t"
                                           ,
                                           "add %[uNew], %[uNew], #1\n\t" /* arm6 / thumb2+ */
                                           "adc %H[uNew], %H[uNew], %[uZeroVal]\n\t",
                                           RTASM_ARM_PICK_6432("X" (0) /* dummy */, [uZeroVal] "r" (0)) );
#  endif
    return u64NewRet;

# else
    /* Generic fallback: fetch-and-add, then derive the new value locally. */
    return ASMAtomicAddU64(pu64, 1) + 1;
# endif
}
#endif
5202
5203
5204/**
5205 * Atomically increment a signed 64-bit value, ordered.
5206 *
5207 * @returns The new value.
5208 * @param pi64 Pointer to the value to increment.
5209 *
5210 * @remarks x86: Requires a Pentium or later.
5211 */
5212DECLINLINE(int64_t) ASMAtomicIncS64(int64_t volatile RT_FAR *pi64) RT_NOTHROW_DEF
5213{
5214 return (int64_t)ASMAtomicIncU64((uint64_t volatile RT_FAR *)pi64);
5215}
5216
5217
5218/**
5219 * Atomically increment a size_t value, ordered.
5220 *
5221 * @returns The new value.
5222 * @param pcb Pointer to the value to increment.
5223 *
5224 * @remarks x86: Requires a 486 or later.
5225 */
5226DECLINLINE(size_t) ASMAtomicIncZ(size_t volatile RT_FAR *pcb) RT_NOTHROW_DEF
5227{
5228#if ARCH_BITS == 64
5229 return ASMAtomicIncU64((uint64_t volatile RT_FAR *)pcb);
5230#elif ARCH_BITS == 32
5231 return ASMAtomicIncU32((uint32_t volatile RT_FAR *)pcb);
5232#elif ARCH_BITS == 16
5233 return ASMAtomicIncU16((uint16_t volatile RT_FAR *)pcb);
5234#else
5235# error "Unsupported ARCH_BITS value"
5236#endif
5237}
5238
5239
5240
/**
 * Atomically decrement an unsigned 16-bit value, ordered.
 *
 * Only the prototype is given here; ASMAtomicDecZ refers to it when
 * ARCH_BITS is 16.  Note that the declared return type is uint32_t although
 * the operand is 16 bits wide.
 *
 * @returns The new value.
 * @param   pu16    Pointer to the value to decrement.
 * @remarks Not implemented. Just to make 16-bit code happy.
 *
 * @remarks x86: Requires a 486 or later.
 */
RT_ASM_DECL_PRAGMA_WATCOM(uint32_t) ASMAtomicDecU16(uint16_t volatile RT_FAR *pu16) RT_NOTHROW_PROTO;
5251
5252
5253/**
5254 * Atomically decrement an unsigned 32-bit value, ordered.
5255 *
5256 * @returns The new value.
5257 * @param pu32 Pointer to the value to decrement.
5258 *
5259 * @remarks x86: Requires a 486 or later.
5260 */
5261#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
5262RT_ASM_DECL_PRAGMA_WATCOM(uint32_t) ASMAtomicDecU32(uint32_t volatile RT_FAR *pu32) RT_NOTHROW_PROTO;
5263#else
5264DECLINLINE(uint32_t) ASMAtomicDecU32(uint32_t volatile RT_FAR *pu32) RT_NOTHROW_DEF
5265{
5266# if RT_INLINE_ASM_USES_INTRIN
5267 return (uint32_t)_InterlockedDecrement((long RT_FAR *)pu32);
5268
5269# elif RT_INLINE_ASM_GNU_STYLE && defined(RT_ARCH_AMD64)
5270# if RT_INLINE_ASM_GNU_STYLE
5271 uint32_t u32;
5272 __asm__ __volatile__("lock; xaddl %0, %1\n\t"
5273 : "=r" (u32)
5274 , "=m" (*pu32)
5275 : "0" (-1)
5276 , "m" (*pu32)
5277 : "memory"
5278 , "cc");
5279 return u32-1;
5280# else
5281 uint32_t u32;
5282 __asm
5283 {
5284 mov eax, -1
5285# ifdef RT_ARCH_AMD64
5286 mov rdx, [pu32]
5287 lock xadd [rdx], eax
5288# else
5289 mov edx, [pu32]
5290 lock xadd [edx], eax
5291# endif
5292 mov u32, eax
5293 }
5294 return u32-1;
5295# endif
5296
5297# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
5298 /* M1 benchmark: ldaddal=6887 vs dmb+ldadd=2120 vs non-lse=6260 (ps/call) */
5299# if defined(RTASM_ARM64_USE_FEAT_LSE)
5300 uint32_t u32NewRet;
5301 __asm__ __volatile__("Lstart_ASMAtomicDecU32_%=:\n\t"
5302# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
5303 "ldaddal %w[uAddend], %w[uNewRet], %[pMem]\n\t"
5304# else
5305 RTASM_ARM_DMB_SY
5306 "ldadd %w[uAddend], %w[uNewRet], %[pMem]\n\t"
5307# endif
5308 "sub %w[uNewRet], %w[uNewRet], #1\n\t"
5309 : [pMem] "+Q" (*pu32)
5310 , [uNewRet] "=&r" (u32NewRet)
5311 : [uAddend] "r" (~(uint32_t)0)
5312 : );
5313# else
5314 RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_32(ASMAtomicDecU32, pu32, DMB_SY,
5315 "sub %w[uNew], %w[uNew], #1\n\t",
5316 "sub %[uNew], %[uNew], #1\n\t" /* arm6 / thumb2+ */,
5317 "X" (0) /* dummy */);
5318# endif
5319 return u32NewRet;
5320
5321# else
5322 return ASMAtomicSubU32(pu32, 1) - (uint32_t)1;
5323# endif
5324}
5325#endif
5326
5327
5328/**
5329 * Atomically decrement a signed 32-bit value, ordered.
5330 *
5331 * @returns The new value.
5332 * @param pi32 Pointer to the value to decrement.
5333 *
5334 * @remarks x86: Requires a 486 or later.
5335 */
5336DECLINLINE(int32_t) ASMAtomicDecS32(int32_t volatile RT_FAR *pi32) RT_NOTHROW_DEF
5337{
5338 return (int32_t)ASMAtomicDecU32((uint32_t volatile RT_FAR *)pi32);
5339}
5340
5341
/**
 * Atomically decrement an unsigned 64-bit value, ordered.
 *
 * Depending on the build configuration this is either an external routine
 * (prototype only) or one of the inline variants below.  Each inline variant
 * fetches the old value while adding an all-ones value (i.e. -1) atomically
 * and then subtracts 1 from the fetched value to produce the return value.
 *
 * @returns The new value.
 * @param   pu64    Pointer to the value to decrement.
 *
 * @remarks x86: Requires a Pentium or later.
 */
#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
RT_ASM_DECL_PRAGMA_WATCOM(uint64_t) ASMAtomicDecU64(uint64_t volatile RT_FAR *pu64) RT_NOTHROW_PROTO;
#else
DECLINLINE(uint64_t) ASMAtomicDecU64(uint64_t volatile RT_FAR *pu64) RT_NOTHROW_DEF
{
# if RT_INLINE_ASM_USES_INTRIN && defined(RT_ARCH_AMD64)
    return (uint64_t)_InterlockedDecrement64((__int64 volatile RT_FAR *)pu64);

# elif RT_INLINE_ASM_GNU_STYLE && defined(RT_ARCH_AMD64)
    uint64_t u64;
    /* xadd leaves the previous memory content in the register operand (u64). */
    __asm__ __volatile__("lock; xaddq %q0, %1\n\t"
                         : "=r" (u64)
                         , "=m" (*pu64)
                         : "0" (~(uint64_t)0)
                         , "m" (*pu64)
                         : "memory"
                         , "cc");
    return u64-1;

# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
#  if defined(RTASM_ARM64_USE_FEAT_LSE)
    uint64_t u64NewRet;
    /* ldadd/ldaddal loads the old value into uNewRet; the trailing sub turns
       it into the new value. */
    __asm__ __volatile__("Lstart_ASMAtomicDecU64_%=:\n\t"
#   if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
                         "ldaddal %[uAddend], %[uNewRet], %[pMem]\n\t"
#   else
                         RTASM_ARM_DMB_SY
                         "ldadd %[uAddend], %[uNewRet], %[pMem]\n\t"
#   endif
                         "sub %[uNewRet], %[uNewRet], #1\n\t"
                         : [pMem] "+Q" (*pu64)
                         , [uNewRet] "=&r" (u64NewRet)
                         : [uAddend] "r" (~(uint64_t)0)
                         : );
#  else
    /* The macro is defined elsewhere; the return below relies on it providing
       u64NewRet.  First string: 64-bit variant, second: 32-bit variant working
       on a register pair.
       NOTE(review): the 32-bit variant uses a plain "sub" followed by "sbc";
       confirm that the borrow the sbc consumes is really produced by that sub
       (a flag-setting "subs" may be intended). */
    RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_64(ASMAtomicDecU64, pu64, DMB_SY,
                                           "sub %[uNew], %[uNew], #1\n\t"
                                           ,
                                           "sub %[uNew], %[uNew], #1\n\t" /* arm6 / thumb2+ */
                                           "sbc %H[uNew], %H[uNew], %[uZeroVal]\n\t",
                                           RTASM_ARM_PICK_6432("X" (0) /* dummy */, [uZeroVal] "r" (0)) );
#  endif
    return u64NewRet;

# else
    /* Generic fallback: adding UINT64_MAX wraps around to a decrement. */
    return ASMAtomicAddU64(pu64, UINT64_MAX) - 1;
# endif
}
#endif
5399
5400
5401/**
5402 * Atomically decrement a signed 64-bit value, ordered.
5403 *
5404 * @returns The new value.
5405 * @param pi64 Pointer to the value to decrement.
5406 *
5407 * @remarks x86: Requires a Pentium or later.
5408 */
5409DECLINLINE(int64_t) ASMAtomicDecS64(int64_t volatile RT_FAR *pi64) RT_NOTHROW_DEF
5410{
5411 return (int64_t)ASMAtomicDecU64((uint64_t volatile RT_FAR *)pi64);
5412}
5413
5414
5415/**
5416 * Atomically decrement a size_t value, ordered.
5417 *
5418 * @returns The new value.
5419 * @param pcb Pointer to the value to decrement.
5420 *
5421 * @remarks x86: Requires a 486 or later.
5422 */
5423DECLINLINE(size_t) ASMAtomicDecZ(size_t volatile RT_FAR *pcb) RT_NOTHROW_DEF
5424{
5425#if ARCH_BITS == 64
5426 return ASMAtomicDecU64((uint64_t volatile RT_FAR *)pcb);
5427#elif ARCH_BITS == 32
5428 return ASMAtomicDecU32((uint32_t volatile RT_FAR *)pcb);
5429#elif ARCH_BITS == 16
5430 return ASMAtomicDecU16((uint16_t volatile RT_FAR *)pcb);
5431#else
5432# error "Unsupported ARCH_BITS value"
5433#endif
5434}
5435
5436
/**
 * Atomically Or an unsigned 32-bit value, ordered.
 *
 * Nothing is returned; use ASMAtomicOrExU32 when the previous value is
 * needed.  Depending on the build configuration this is either an external
 * routine (prototype only) or one of the inline variants below.
 *
 * @param   pu32    Pointer to the variable to OR @a u32 with.
 * @param   u32     The value to OR @a *pu32 with.
 *
 * @remarks x86: Requires a 386 or later.
 */
#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
RT_ASM_DECL_PRAGMA_WATCOM(void) ASMAtomicOrU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_PROTO;
#else
DECLINLINE(void) ASMAtomicOrU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_DEF
{
# if RT_INLINE_ASM_USES_INTRIN
    _InterlockedOr((long volatile RT_FAR *)pu32, (long)u32);

# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
#  if RT_INLINE_ASM_GNU_STYLE
    /* Locked read-modify-write directly on the memory operand. */
    __asm__ __volatile__("lock; orl %1, %0\n\t"
                         : "=m" (*pu32)
                         : "ir" (u32)
                         , "m" (*pu32)
                         : "cc");
#  else
    __asm
    {
        mov eax, [u32]
#   ifdef RT_ARCH_AMD64
        mov rdx, [pu32]
        lock or [rdx], eax
#   else
        mov edx, [pu32]
        lock or [edx], eax
#   endif
    }
#  endif

# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
#  if defined(RTASM_ARM64_USE_FEAT_LSE)
#   if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
    /* ldsetal also loads the old value; it lands in u32Spill and is never read. */
    uint32_t u32Spill;
    __asm__ __volatile__("Lstart_ASMAtomicOrU32_%=:\n\t"
                         "ldsetal %w[fBitsToSet], %w[uSpill], %[pMem]\n\t"
                         : [pMem] "+Q" (*pu32)
                         , [uSpill] "=&r" (u32Spill)
                         : [fBitsToSet] "r" (u32)
                         : );
#   else
    /* Store-only form, preceded by whatever RTASM_ARM_DMB_SY expands to
       (presumably a full barrier - defined elsewhere). */
    __asm__ __volatile__("Lstart_ASMAtomicOrU32_%=:\n\t"
                         RTASM_ARM_DMB_SY
                         "stset %w[fBitsToSet], %[pMem]\n\t"
                         : [pMem] "+Q" (*pu32)
                         : [fBitsToSet] "r" (u32)
                         : );
#   endif
#  else
    /* For more on Orr see https://en.wikipedia.org/wiki/Orr_(Catch-22) ;-) */
    /* First string: 64-bit ARM variant, second: 32-bit ARM variant (macro
       defined elsewhere). */
    RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_32(ASMAtomicOr32, pu32, DMB_SY,
                                           "orr %w[uNew], %w[uNew], %w[uVal]\n\t",
                                           "orr %[uNew], %[uNew], %[uVal]\n\t",
                                           [uVal] "r" (u32));

#  endif
# else
#  error "Port me"
# endif
}
#endif
5505
5506
5507/**
5508 * Atomically OR an unsigned 32-bit value, ordered, extended version (for bitmap
5509 * fallback).
5510 *
5511 * @returns Old value.
5512 * @param pu32 Pointer to the variable to OR @a u32 with.
5513 * @param u32 The value to OR @a *pu32 with.
5514 */
5515DECLINLINE(uint32_t) ASMAtomicOrExU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_DEF
5516{
5517#if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
5518# if defined(RTASM_ARM64_USE_FEAT_LSE)
5519 uint32_t u32OldRet;
5520 __asm__ __volatile__("Lstart_ASMAtomicOrExU32_%=:\n\t"
5521# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
5522 "ldsetal %w[fBitsToSet], %w[uOldRet], %[pMem]\n\t"
5523# else
5524 RTASM_ARM_DMB_SY
5525 "ldset %w[fBitsToSet], %w[uOldRet], %[pMem]\n\t"
5526# endif
5527 : [pMem] "+Q" (*pu32)
5528 , [uOldRet] "=&r" (u32OldRet)
5529 : [fBitsToSet] "r" (u32)
5530 : );
5531# else
5532 RTASM_ARM_LOAD_MODIFY_STORE_RET_OLD_32(ASMAtomicOrEx32, pu32, DMB_SY,
5533 "orr %w[uNew], %w[uOld], %w[uVal]\n\t",
5534 "orr %[uNew], %[uOld], %[uVal]\n\t",
5535 [uVal] "r" (u32));
5536# endif
5537 return u32OldRet;
5538
5539#else
5540 uint32_t u32RetOld = ASMAtomicUoReadU32(pu32);
5541 uint32_t u32New;
5542 do
5543 u32New = u32RetOld | u32;
5544 while (!ASMAtomicCmpXchgExU32(pu32, u32New, u32RetOld, &u32RetOld));
5545 return u32RetOld;
5546#endif
5547}
5548
5549
5550/**
5551 * Atomically Or a signed 32-bit value, ordered.
5552 *
5553 * @param pi32 Pointer to the pointer variable to OR u32 with.
5554 * @param i32 The value to OR *pu32 with.
5555 *
5556 * @remarks x86: Requires a 386 or later.
5557 */
5558DECLINLINE(void) ASMAtomicOrS32(int32_t volatile RT_FAR *pi32, int32_t i32) RT_NOTHROW_DEF
5559{
5560 ASMAtomicOrU32((uint32_t volatile RT_FAR *)pi32, (uint32_t)i32);
5561}
5562
5563
5564/**
5565 * Atomically Or an unsigned 64-bit value, ordered.
5566 *
5567 * @param pu64 Pointer to the pointer variable to OR u64 with.
5568 * @param u64 The value to OR *pu64 with.
5569 *
5570 * @remarks x86: Requires a Pentium or later.
5571 */
5572#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
5573DECLASM(void) ASMAtomicOrU64(uint64_t volatile RT_FAR *pu64, uint64_t u64) RT_NOTHROW_PROTO;
5574#else
5575DECLINLINE(void) ASMAtomicOrU64(uint64_t volatile RT_FAR *pu64, uint64_t u64) RT_NOTHROW_DEF
5576{
5577# if RT_INLINE_ASM_USES_INTRIN && defined(RT_ARCH_AMD64)
5578 _InterlockedOr64((__int64 volatile RT_FAR *)pu64, (__int64)u64);
5579
5580# elif RT_INLINE_ASM_GNU_STYLE && defined(RT_ARCH_AMD64)
5581 __asm__ __volatile__("lock; orq %1, %q0\n\t"
5582 : "=m" (*pu64)
5583 : "r" (u64)
5584 , "m" (*pu64)
5585 : "cc");
5586
5587# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
5588# if defined(RTASM_ARM64_USE_FEAT_LSE)
5589# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
5590 uint64_t u64Spill;
5591 __asm__ __volatile__("Lstart_ASMAtomicOrU64_%=:\n\t"
5592 "ldsetal %[fBitsToSet], %[uSpill], %[pMem]\n\t"
5593 : [pMem] "+Q" (*pu64)
5594 , [uSpill] "=&r" (u64Spill)
5595 : [fBitsToSet] "r" (u64)
5596 : );
5597# else
5598 __asm__ __volatile__("Lstart_ASMAtomicOrU64_%=:\n\t"
5599 RTASM_ARM_DMB_SY
5600 "stset %[fBitsToSet], %[pMem]\n\t"
5601 : [pMem] "+Q" (*pu64)
5602 : [fBitsToSet] "r" (u64)
5603 : );
5604# endif
5605# else
5606 RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_64(ASMAtomicOrU64, pu64, DMB_SY,
5607 "orr %[uNew], %[uNew], %[uVal]\n\t"
5608 ,
5609 "orr %[uNew], %[uNew], %[uVal]\n\t"
5610 "orr %H[uNew], %H[uNew], %H[uVal]\n\t",
5611 [uVal] "r" (u64));
5612# endif
5613
5614# else
5615 for (;;)
5616 {
5617 uint64_t u64Old = ASMAtomicUoReadU64(pu64);
5618 uint64_t u64New = u64Old | u64;
5619 if (ASMAtomicCmpXchgU64(pu64, u64New, u64Old))
5620 break;
5621 ASMNopPause();
5622 }
5623# endif
5624}
5625#endif
5626
5627
5628/**
5629 * Atomically Or a signed 64-bit value, ordered.
5630 *
5631 * @param pi64 Pointer to the pointer variable to OR u64 with.
5632 * @param i64 The value to OR *pu64 with.
5633 *
5634 * @remarks x86: Requires a Pentium or later.
5635 */
5636DECLINLINE(void) ASMAtomicOrS64(int64_t volatile RT_FAR *pi64, int64_t i64) RT_NOTHROW_DEF
5637{
5638 ASMAtomicOrU64((uint64_t volatile RT_FAR *)pi64, (uint64_t)i64);
5639}
5640
5641
/**
 * Atomically And an unsigned 32-bit value, ordered.
 *
 * Nothing is returned; use ASMAtomicAndExU32 when the previous value is
 * needed.  Depending on the build configuration this is either an external
 * routine (prototype only) or one of the inline variants below.
 *
 * @param   pu32    Pointer to the variable to AND @a u32 with.
 * @param   u32     The value to AND @a *pu32 with.
 *
 * @remarks x86: Requires a 386 or later.
 */
#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
RT_ASM_DECL_PRAGMA_WATCOM(void) ASMAtomicAndU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_PROTO;
#else
DECLINLINE(void) ASMAtomicAndU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_DEF
{
# if RT_INLINE_ASM_USES_INTRIN
    _InterlockedAnd((long volatile RT_FAR *)pu32, u32);

# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
#  if RT_INLINE_ASM_GNU_STYLE
    /* Locked read-modify-write directly on the memory operand. */
    __asm__ __volatile__("lock; andl %1, %0\n\t"
                         : "=m" (*pu32)
                         : "ir" (u32)
                         , "m" (*pu32)
                         : "cc");
#  else
    __asm
    {
        mov eax, [u32]
#   ifdef RT_ARCH_AMD64
        mov rdx, [pu32]
        lock and [rdx], eax
#   else
        mov edx, [pu32]
        lock and [edx], eax
#   endif
    }
#  endif

# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
#  if defined(RTASM_ARM64_USE_FEAT_LSE)
    /* The LSE instructions clear bits rather than AND, hence the inverted
       mask (~u32) passed as fBitsToClear. */
#   if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
    /* ldclral also loads the old value; it lands in u32Spill and is never read. */
    uint32_t u32Spill;
    __asm__ __volatile__("Lstart_ASMAtomicAndU32_%=:\n\t"
                         "ldclral %w[fBitsToClear], %w[uSpill], %[pMem]\n\t"
                         : [pMem] "+Q" (*pu32)
                         , [uSpill] "=&r" (u32Spill)
                         : [fBitsToClear] "r" (~u32)
                         : );
#   else
    __asm__ __volatile__("Lstart_ASMAtomicAndU32_%=:\n\t"
                         RTASM_ARM_DMB_SY
                         "stclr %w[fBitsToClear], %[pMem]\n\t"
                         : [pMem] "+Q" (*pu32)
                         : [fBitsToClear] "r" (~u32)
                         : );
#   endif
#  else
    /* First string: 64-bit ARM variant, second: 32-bit ARM variant (macro
       defined elsewhere). */
    RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_32(ASMAtomicAnd32, pu32, DMB_SY,
                                           "and %w[uNew], %w[uNew], %w[uVal]\n\t",
                                           "and %[uNew], %[uNew], %[uVal]\n\t",
                                           [uVal] "r" (u32));

#  endif
# else
#  error "Port me"
# endif
}
#endif
5709
5710
5711/**
5712 * Atomically AND an unsigned 32-bit value, ordered, extended version.
5713 *
5714 * @returns Old value.
5715 * @param pu32 Pointer to the variable to AND @a u32 with.
5716 * @param u32 The value to AND @a *pu32 with.
5717 */
5718DECLINLINE(uint32_t) ASMAtomicAndExU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_DEF
5719{
5720#if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
5721# if defined(RTASM_ARM64_USE_FEAT_LSE)
5722 uint32_t u32OldRet;
5723 __asm__ __volatile__("Lstart_ASMAtomicAndExU32_%=:\n\t"
5724# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
5725 "ldclral %w[fBitsToClear], %w[uOldRet], %[pMem]\n\t"
5726# else
5727 RTASM_ARM_DMB_SY
5728 "ldclr %w[fBitsToClear], %w[uOldRet], %[pMem]\n\t"
5729# endif
5730 : [pMem] "+Q" (*pu32)
5731 , [uOldRet] "=&r" (u32OldRet)
5732 : [fBitsToClear] "r" (~u32)
5733 : );
5734# else
5735 RTASM_ARM_LOAD_MODIFY_STORE_RET_OLD_32(ASMAtomicAndEx32, pu32, DMB_SY,
5736 "and %w[uNew], %w[uOld], %w[uVal]\n\t",
5737 "and %[uNew], %[uOld], %[uVal]\n\t",
5738 [uVal] "r" (u32));
5739# endif
5740 return u32OldRet;
5741
5742#else
5743 uint32_t u32RetOld = ASMAtomicUoReadU32(pu32);
5744 uint32_t u32New;
5745 do
5746 u32New = u32RetOld & u32;
5747 while (!ASMAtomicCmpXchgExU32(pu32, u32New, u32RetOld, &u32RetOld));
5748 return u32RetOld;
5749#endif
5750}
5751
5752
5753/**
5754 * Atomically And a signed 32-bit value, ordered.
5755 *
5756 * @param pi32 Pointer to the pointer variable to AND i32 with.
5757 * @param i32 The value to AND *pi32 with.
5758 *
5759 * @remarks x86: Requires a 386 or later.
5760 */
5761DECLINLINE(void) ASMAtomicAndS32(int32_t volatile RT_FAR *pi32, int32_t i32) RT_NOTHROW_DEF
5762{
5763 ASMAtomicAndU32((uint32_t volatile RT_FAR *)pi32, (uint32_t)i32);
5764}
5765
5766
5767/**
5768 * Atomically And an unsigned 64-bit value, ordered.
5769 *
5770 * @param pu64 Pointer to the pointer variable to AND u64 with.
5771 * @param u64 The value to AND *pu64 with.
5772 *
5773 * @remarks x86: Requires a Pentium or later.
5774 */
5775#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
5776DECLASM(void) ASMAtomicAndU64(uint64_t volatile RT_FAR *pu64, uint64_t u64) RT_NOTHROW_PROTO;
5777#else
5778DECLINLINE(void) ASMAtomicAndU64(uint64_t volatile RT_FAR *pu64, uint64_t u64) RT_NOTHROW_DEF
5779{
5780# if RT_INLINE_ASM_USES_INTRIN && defined(RT_ARCH_AMD64)
5781 _InterlockedAnd64((__int64 volatile RT_FAR *)pu64, u64);
5782
5783# elif RT_INLINE_ASM_GNU_STYLE && defined(RT_ARCH_AMD64)
5784 __asm__ __volatile__("lock; andq %1, %0\n\t"
5785 : "=m" (*pu64)
5786 : "r" (u64)
5787 , "m" (*pu64)
5788 : "cc");
5789
5790# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
5791# if defined(RTASM_ARM64_USE_FEAT_LSE)
5792# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
5793 uint64_t u64Spill;
5794 __asm__ __volatile__("Lstart_ASMAtomicAndU64_%=:\n\t"
5795 "ldclral %[fBitsToClear], %[uSpill], %[pMem]\n\t"
5796 : [pMem] "+Q" (*pu64)
5797 , [uSpill] "=&r" (u64Spill)
5798 : [fBitsToClear] "r" (~u64)
5799 : );
5800# else
5801 __asm__ __volatile__("Lstart_ASMAtomicAndU64_%=:\n\t"
5802 RTASM_ARM_DMB_SY
5803 "stclr %[fBitsToClear], %[pMem]\n\t"
5804 : [pMem] "+Q" (*pu64)
5805 : [fBitsToClear] "r" (~u64)
5806 : );
5807# endif
5808# else
5809 RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_64(ASMAtomicAndU64, pu64, DMB_SY,
5810 "and %[uNew], %[uNew], %[uVal]\n\t"
5811 ,
5812 "and %[uNew], %[uNew], %[uVal]\n\t"
5813 "and %H[uNew], %H[uNew], %H[uVal]\n\t",
5814 [uVal] "r" (u64));
5815# endif
5816
5817# else
5818 for (;;)
5819 {
5820 uint64_t u64Old = ASMAtomicUoReadU64(pu64);
5821 uint64_t u64New = u64Old & u64;
5822 if (ASMAtomicCmpXchgU64(pu64, u64New, u64Old))
5823 break;
5824 ASMNopPause();
5825 }
5826# endif
5827}
5828#endif
5829
5830
5831/**
5832 * Atomically And a signed 64-bit value, ordered.
5833 *
5834 * @param pi64 Pointer to the pointer variable to AND i64 with.
5835 * @param i64 The value to AND *pi64 with.
5836 *
5837 * @remarks x86: Requires a Pentium or later.
5838 */
5839DECLINLINE(void) ASMAtomicAndS64(int64_t volatile RT_FAR *pi64, int64_t i64) RT_NOTHROW_DEF
5840{
5841 ASMAtomicAndU64((uint64_t volatile RT_FAR *)pi64, (uint64_t)i64);
5842}
5843
5844
/**
 * Atomically XOR an unsigned 32-bit value and a memory location, ordered.
 *
 * Nothing is returned; use ASMAtomicXorExU32 when the previous value is
 * needed.  Depending on the build configuration this is either an external
 * routine (prototype only) or one of the inline variants below.
 *
 * @param   pu32    Pointer to the variable to XOR @a u32 with.
 * @param   u32     The value to XOR @a *pu32 with.
 *
 * @remarks x86: Requires a 386 or later.
 */
#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
RT_ASM_DECL_PRAGMA_WATCOM(void) ASMAtomicXorU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_PROTO;
#else
DECLINLINE(void) ASMAtomicXorU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_DEF
{
# if RT_INLINE_ASM_USES_INTRIN
    _InterlockedXor((long volatile RT_FAR *)pu32, u32);

# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
#  if RT_INLINE_ASM_GNU_STYLE
    /* Locked read-modify-write directly on the memory operand. */
    __asm__ __volatile__("lock; xorl %1, %0\n\t"
                         : "=m" (*pu32)
                         : "ir" (u32)
                         , "m" (*pu32)
                         : "cc");
#  else
    __asm
    {
        mov eax, [u32]
#   ifdef RT_ARCH_AMD64
        mov rdx, [pu32]
        lock xor [rdx], eax
#   else
        mov edx, [pu32]
        lock xor [edx], eax
#   endif
    }
#  endif

# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
#  if defined(RTASM_ARM64_USE_FEAT_LSE)
#   if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
    /* ldeoral also loads the old value; it lands in u32Spill and is never read. */
    uint32_t u32Spill;
    __asm__ __volatile__("Lstart_ASMAtomicXorU32_%=:\n\t"
                         "ldeoral %w[fBitMask], %w[uSpill], %[pMem]\n\t"
                         : [pMem] "+Q" (*pu32)
                         , [uSpill] "=&r" (u32Spill)
                         : [fBitMask] "r" (u32)
                         : );
#   else
    __asm__ __volatile__("Lstart_ASMAtomicXorU32_%=:\n\t"
                         RTASM_ARM_DMB_SY
                         "steor %w[fBitMask], %[pMem]\n\t"
                         : [pMem] "+Q" (*pu32)
                         : [fBitMask] "r" (u32)
                         : );
#   endif
#  else
    /* First string: 64-bit ARM variant, second: 32-bit ARM variant (macro
       defined elsewhere). */
    RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_32(ASMAtomicXor32, pu32, DMB_SY,
                                           "eor %w[uNew], %w[uNew], %w[uVal]\n\t",
                                           "eor %[uNew], %[uNew], %[uVal]\n\t",
                                           [uVal] "r" (u32));
#  endif

# else
#  error "Port me"
# endif
}
#endif
5912
5913
5914/**
5915 * Atomically XOR an unsigned 32-bit value and a memory location, ordered,
5916 * extended version (for bitmaps).
5917 *
5918 * @returns Old value.
5919 * @param pu32 Pointer to the variable to XOR @a u32 with.
5920 * @param u32 The value to XOR @a *pu32 with.
5921 */
5922DECLINLINE(uint32_t) ASMAtomicXorExU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_DEF
5923{
5924#if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
5925# if defined(RTASM_ARM64_USE_FEAT_LSE)
5926 uint32_t u32OldRet;
5927 __asm__ __volatile__("Lstart_ASMAtomicXorExU32_%=:\n\t"
5928# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
5929 "ldeoral %w[fBitMask], %w[uOldRet], %[pMem]\n\t"
5930# else
5931 RTASM_ARM_DMB_SY
5932 "ldeor %w[fBitMask], %w[uOldRet], %[pMem]\n\t"
5933# endif
5934 : [pMem] "+Q" (*pu32)
5935 , [uOldRet] "=&r" (u32OldRet)
5936 : [fBitMask] "r" (u32)
5937 : );
5938# else
5939 RTASM_ARM_LOAD_MODIFY_STORE_RET_OLD_32(ASMAtomicXorEx32, pu32, DMB_SY,
5940 "eor %w[uNew], %w[uOld], %w[uVal]\n\t",
5941 "eor %[uNew], %[uOld], %[uVal]\n\t",
5942 [uVal] "r" (u32));
5943# endif
5944 return u32OldRet;
5945
5946#else
5947 uint32_t u32RetOld = ASMAtomicUoReadU32(pu32);
5948 uint32_t u32New;
5949 do
5950 u32New = u32RetOld ^ u32;
5951 while (!ASMAtomicCmpXchgExU32(pu32, u32New, u32RetOld, &u32RetOld));
5952 return u32RetOld;
5953#endif
5954}
5955
5956
5957/**
5958 * Atomically XOR a signed 32-bit value, ordered.
5959 *
5960 * @param pi32 Pointer to the variable to XOR i32 with.
5961 * @param i32 The value to XOR *pi32 with.
5962 *
5963 * @remarks x86: Requires a 386 or later.
5964 */
5965DECLINLINE(void) ASMAtomicXorS32(int32_t volatile RT_FAR *pi32, int32_t i32) RT_NOTHROW_DEF
5966{
5967 ASMAtomicXorU32((uint32_t volatile RT_FAR *)pi32, (uint32_t)i32);
5968}
5969
5970
5971/**
5972 * Atomically OR an unsigned 32-bit value, unordered but interrupt safe.
5973 *
5974 * @param pu32 Pointer to the pointer variable to OR u32 with.
5975 * @param u32 The value to OR *pu32 with.
5976 *
5977 * @remarks x86: Requires a 386 or later.
5978 */
5979#if RT_INLINE_ASM_EXTERNAL_TMP_ARM
5980RT_ASM_DECL_PRAGMA_WATCOM(void) ASMAtomicUoOrU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_PROTO;
5981#else
5982DECLINLINE(void) ASMAtomicUoOrU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_DEF
5983{
5984# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
5985# if RT_INLINE_ASM_GNU_STYLE
5986 __asm__ __volatile__("orl %1, %0\n\t"
5987 : "=m" (*pu32)
5988 : "ir" (u32)
5989 , "m" (*pu32)
5990 : "cc");
5991# else
5992 __asm
5993 {
5994 mov eax, [u32]
5995# ifdef RT_ARCH_AMD64
5996 mov rdx, [pu32]
5997 or [rdx], eax
5998# else
5999 mov edx, [pu32]
6000 or [edx], eax
6001# endif
6002 }
6003# endif
6004
6005# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
6006 /* M1 benchmark: stset=1974 vs non-lse=6271 */
6007# if defined(RTASM_ARM64_USE_FEAT_LSE)
6008 __asm__ __volatile__("Lstart_ASMAtomicUoOrU32_%=:\n\t"
6009 "stset %w[fBitsToSet], %[pMem]\n\t"
6010 : [pMem] "+Q" (*pu32)
6011 : [fBitsToSet] "r" (u32)
6012 : );
6013# else
6014 RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_32(ASMAtomicUoOrU32, pu32, NO_BARRIER,
6015 "orr %w[uNew], %w[uNew], %w[uVal]\n\t",
6016 "orr %[uNew], %[uNew], %[uVal]\n\t",
6017 [uVal] "r" (u32));
6018# endif
6019
6020# else
6021# error "Port me"
6022# endif
6023}
6024#endif
6025
6026
6027/**
6028 * Atomically OR an unsigned 32-bit value, unordered but interrupt safe,
6029 * extended version (for bitmap fallback).
6030 *
6031 * @returns Old value.
6032 * @param pu32 Pointer to the variable to OR @a u32 with.
6033 * @param u32 The value to OR @a *pu32 with.
6034 */
6035DECLINLINE(uint32_t) ASMAtomicUoOrExU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_DEF
6036{
6037#if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
6038# if defined(RTASM_ARM64_USE_FEAT_LSE)
6039 uint32_t u32OldRet;
6040 __asm__ __volatile__("Lstart_ASMAtomicOrExU32_%=:\n\t"
6041 "ldset %w[fBitsToSet], %w[uOldRet], %[pMem]\n\t"
6042 : [pMem] "+Q" (*pu32)
6043 , [uOldRet] "=&r" (u32OldRet)
6044 : [fBitsToSet] "r" (u32)
6045 : );
6046# else
6047 RTASM_ARM_LOAD_MODIFY_STORE_RET_OLD_32(ASMAtomicUoOrExU32, pu32, NO_BARRIER,
6048 "orr %w[uNew], %w[uOld], %w[uVal]\n\t",
6049 "orr %[uNew], %[uOld], %[uVal]\n\t",
6050 [uVal] "r" (u32));
6051# endif
6052 return u32OldRet;
6053
6054#else
6055 return ASMAtomicOrExU32(pu32, u32); /* (we have no unordered cmpxchg primitive atm.) */
6056#endif
6057}
6058
6059
6060/**
6061 * Atomically OR a signed 32-bit value, unordered.
6062 *
6063 * @param pi32 Pointer to the pointer variable to OR u32 with.
6064 * @param i32 The value to OR *pu32 with.
6065 *
6066 * @remarks x86: Requires a 386 or later.
6067 */
6068DECLINLINE(void) ASMAtomicUoOrS32(int32_t volatile RT_FAR *pi32, int32_t i32) RT_NOTHROW_DEF
6069{
6070 ASMAtomicUoOrU32((uint32_t volatile RT_FAR *)pi32, (uint32_t)i32);
6071}
6072
6073
6074/**
6075 * Atomically OR an unsigned 64-bit value, unordered.
6076 *
6077 * @param pu64 Pointer to the pointer variable to OR u64 with.
6078 * @param u64 The value to OR *pu64 with.
6079 *
6080 * @remarks x86: Requires a Pentium or later.
6081 */
6082#if RT_INLINE_ASM_EXTERNAL_TMP_ARM
6083DECLASM(void) ASMAtomicUoOrU64(uint64_t volatile RT_FAR *pu64, uint64_t u64) RT_NOTHROW_PROTO;
6084#else
6085DECLINLINE(void) ASMAtomicUoOrU64(uint64_t volatile RT_FAR *pu64, uint64_t u64) RT_NOTHROW_DEF
6086{
6087# if RT_INLINE_ASM_GNU_STYLE && defined(RT_ARCH_AMD64)
6088 __asm__ __volatile__("orq %1, %q0\n\t"
6089 : "=m" (*pu64)
6090 : "r" (u64)
6091 , "m" (*pu64)
6092 : "cc");
6093
6094# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
6095# if defined(RTASM_ARM64_USE_FEAT_LSE)
6096 __asm__ __volatile__("Lstart_ASMAtomicUoOrU64_%=:\n\t"
6097 "stset %[fBitsToSet], %[pMem]\n\t"
6098 : [pMem] "+Q" (*pu64)
6099 : [fBitsToSet] "r" (u64)
6100 : );
6101# else
6102 RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_64(ASMAtomicUoOrU64, pu64, NO_BARRIER,
6103 "orr %[uNew], %[uNew], %[uVal]\n\t"
6104 ,
6105 "orr %[uNew], %[uNew], %[uVal]\n\t"
6106 "orr %H[uNew], %H[uNew], %H[uVal]\n\t",
6107 [uVal] "r" (u64));
6108# endif
6109
6110# else
6111 for (;;)
6112 {
6113 uint64_t u64Old = ASMAtomicUoReadU64(pu64);
6114 uint64_t u64New = u64Old | u64;
6115 if (ASMAtomicCmpXchgU64(pu64, u64New, u64Old))
6116 break;
6117 ASMNopPause();
6118 }
6119# endif
6120}
6121#endif
6122
6123
6124/**
6125 * Atomically Or a signed 64-bit value, unordered.
6126 *
6127 * @param pi64 Pointer to the pointer variable to OR u64 with.
6128 * @param i64 The value to OR *pu64 with.
6129 *
6130 * @remarks x86: Requires a Pentium or later.
6131 */
6132DECLINLINE(void) ASMAtomicUoOrS64(int64_t volatile RT_FAR *pi64, int64_t i64) RT_NOTHROW_DEF
6133{
6134 ASMAtomicUoOrU64((uint64_t volatile RT_FAR *)pi64, (uint64_t)i64);
6135}
6136
6137
6138/**
6139 * Atomically And an unsigned 32-bit value, unordered.
6140 *
6141 * @param pu32 Pointer to the pointer variable to AND u32 with.
6142 * @param u32 The value to AND *pu32 with.
6143 *
6144 * @remarks x86: Requires a 386 or later.
6145 */
6146#if RT_INLINE_ASM_EXTERNAL_TMP_ARM
6147RT_ASM_DECL_PRAGMA_WATCOM(void) ASMAtomicUoAndU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_PROTO;
6148#else
6149DECLINLINE(void) ASMAtomicUoAndU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_DEF
6150{
6151# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
6152# if RT_INLINE_ASM_GNU_STYLE
6153 __asm__ __volatile__("andl %1, %0\n\t"
6154 : "=m" (*pu32)
6155 : "ir" (u32)
6156 , "m" (*pu32)
6157 : "cc");
6158# else
6159 __asm
6160 {
6161 mov eax, [u32]
6162# ifdef RT_ARCH_AMD64
6163 mov rdx, [pu32]
6164 and [rdx], eax
6165# else
6166 mov edx, [pu32]
6167 and [edx], eax
6168# endif
6169 }
6170# endif
6171
6172# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
6173 /* M1 benchmark: stclr=1884 vs non-lse=6299 (ps/call) */
6174# if defined(RTASM_ARM64_USE_FEAT_LSE)
6175 __asm__ __volatile__("Lstart_ASMAtomicUoAndU32_%=:\n\t"
6176 "stclr %w[fBitsToClear], %[pMem]\n\t"
6177 : [pMem] "+Q" (*pu32)
6178 : [fBitsToClear] "r" (~u32)
6179 : );
6180# else
6181 RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_32(ASMAtomicUoAnd32, pu32, NO_BARRIER,
6182 "and %w[uNew], %w[uNew], %w[uVal]\n\t",
6183 "and %[uNew], %[uNew], %[uVal]\n\t",
6184 [uVal] "r" (u32));
6185# endif
6186
6187# else
6188# error "Port me"
6189# endif
6190}
6191#endif
6192
6193
6194/**
6195 * Atomically AND an unsigned 32-bit value, unordered, extended version (for
6196 * bitmap fallback).
6197 *
6198 * @returns Old value.
6199 * @param pu32 Pointer to the pointer to AND @a u32 with.
6200 * @param u32 The value to AND @a *pu32 with.
6201 */
6202DECLINLINE(uint32_t) ASMAtomicUoAndExU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_DEF
6203{
6204#if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
6205# if defined(RTASM_ARM64_USE_FEAT_LSE)
6206 uint32_t u32OldRet;
6207 __asm__ __volatile__("Lstart_ASMAtomicAndExU32_%=:\n\t"
6208 "ldclr %w[fBitsToClear], %w[uOldRet], %[pMem]\n\t"
6209 : [pMem] "+Q" (*pu32)
6210 , [uOldRet] "=&r" (u32OldRet)
6211 : [fBitsToClear] "r" (~u32)
6212 : );
6213# else
6214 RTASM_ARM_LOAD_MODIFY_STORE_RET_OLD_32(ASMAtomicUoAndEx32, pu32, NO_BARRIER,
6215 "and %w[uNew], %w[uOld], %w[uVal]\n\t",
6216 "and %[uNew], %[uOld], %[uVal]\n\t",
6217 [uVal] "r" (u32));
6218# endif
6219 return u32OldRet;
6220
6221#else
6222 return ASMAtomicAndExU32(pu32, u32); /* (we have no unordered cmpxchg primitive atm.) */
6223#endif
6224}
6225
6226
6227/**
6228 * Atomically And a signed 32-bit value, unordered.
6229 *
6230 * @param pi32 Pointer to the pointer variable to AND i32 with.
6231 * @param i32 The value to AND *pi32 with.
6232 *
6233 * @remarks x86: Requires a 386 or later.
6234 */
6235DECLINLINE(void) ASMAtomicUoAndS32(int32_t volatile RT_FAR *pi32, int32_t i32) RT_NOTHROW_DEF
6236{
6237 ASMAtomicUoAndU32((uint32_t volatile RT_FAR *)pi32, (uint32_t)i32);
6238}
6239
6240
6241/**
6242 * Atomically And an unsigned 64-bit value, unordered.
6243 *
6244 * @param pu64 Pointer to the pointer variable to AND u64 with.
6245 * @param u64 The value to AND *pu64 with.
6246 *
6247 * @remarks x86: Requires a Pentium or later.
6248 */
6249#if RT_INLINE_ASM_EXTERNAL_TMP_ARM
6250DECLASM(void) ASMAtomicUoAndU64(uint64_t volatile RT_FAR *pu64, uint64_t u64) RT_NOTHROW_PROTO;
6251#else
6252DECLINLINE(void) ASMAtomicUoAndU64(uint64_t volatile RT_FAR *pu64, uint64_t u64) RT_NOTHROW_DEF
6253{
6254# if RT_INLINE_ASM_GNU_STYLE && defined(RT_ARCH_AMD64)
6255 __asm__ __volatile__("andq %1, %0\n\t"
6256 : "=m" (*pu64)
6257 : "r" (u64)
6258 , "m" (*pu64)
6259 : "cc");
6260
6261# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
6262# if defined(RTASM_ARM64_USE_FEAT_LSE)
6263 __asm__ __volatile__("Lstart_ASMAtomicUoAndU64_%=:\n\t"
6264 "stclr %[fBitsToClear], %[pMem]\n\t"
6265 : [pMem] "+Q" (*pu64)
6266 : [fBitsToClear] "r" (~u64)
6267 : );
6268# else
6269 RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_64(ASMAtomicUoAndU64, pu64, NO_BARRIER,
6270 "and %[uNew], %[uNew], %[uVal]\n\t"
6271 ,
6272 "and %[uNew], %[uNew], %[uVal]\n\t"
6273 "and %H[uNew], %H[uNew], %H[uVal]\n\t",
6274 [uVal] "r" (u64));
6275# endif
6276
6277# else
6278 for (;;)
6279 {
6280 uint64_t u64Old = ASMAtomicUoReadU64(pu64);
6281 uint64_t u64New = u64Old & u64;
6282 if (ASMAtomicCmpXchgU64(pu64, u64New, u64Old))
6283 break;
6284 ASMNopPause();
6285 }
6286# endif
6287}
6288#endif
6289
6290
6291/**
6292 * Atomically And a signed 64-bit value, unordered.
6293 *
6294 * @param pi64 Pointer to the pointer variable to AND i64 with.
6295 * @param i64 The value to AND *pi64 with.
6296 *
6297 * @remarks x86: Requires a Pentium or later.
6298 */
6299DECLINLINE(void) ASMAtomicUoAndS64(int64_t volatile RT_FAR *pi64, int64_t i64) RT_NOTHROW_DEF
6300{
6301 ASMAtomicUoAndU64((uint64_t volatile RT_FAR *)pi64, (uint64_t)i64);
6302}
6303
6304
6305/**
6306 * Atomically XOR an unsigned 32-bit value, unordered but interrupt safe.
6307 *
6308 * @param pu32 Pointer to the variable to XOR @a u32 with.
6309 * @param u32 The value to OR @a *pu32 with.
6310 *
6311 * @remarks x86: Requires a 386 or later.
6312 */
6313#if RT_INLINE_ASM_EXTERNAL_TMP_ARM
6314RT_ASM_DECL_PRAGMA_WATCOM(void) ASMAtomicUoXorU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_PROTO;
6315#else
6316DECLINLINE(void) ASMAtomicUoXorU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_DEF
6317{
6318# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
6319# if RT_INLINE_ASM_GNU_STYLE
6320 __asm__ __volatile__("xorl %1, %0\n\t"
6321 : "=m" (*pu32)
6322 : "ir" (u32)
6323 , "m" (*pu32)
6324 : "cc");
6325# else
6326 __asm
6327 {
6328 mov eax, [u32]
6329# ifdef RT_ARCH_AMD64
6330 mov rdx, [pu32]
6331 xor [rdx], eax
6332# else
6333 mov edx, [pu32]
6334 xor [edx], eax
6335# endif
6336 }
6337# endif
6338
6339# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
6340# if defined(RTASM_ARM64_USE_FEAT_LSE)
6341 __asm__ __volatile__("Lstart_ASMAtomicUoXorU32_%=:\n\t"
6342 "steor %w[fBitMask], %[pMem]\n\t"
6343 : [pMem] "+Q" (*pu32)
6344 : [fBitMask] "r" (u32)
6345 : );
6346# else
6347 RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_32(ASMAtomicUoXorU32, pu32, NO_BARRIER,
6348 "eor %w[uNew], %w[uNew], %w[uVal]\n\t",
6349 "eor %[uNew], %[uNew], %[uVal]\n\t",
6350 [uVal] "r" (u32));
6351# endif
6352
6353# else
6354# error "Port me"
6355# endif
6356}
6357#endif
6358
6359
6360/**
6361 * Atomically XOR an unsigned 32-bit value, unordered but interrupt safe,
6362 * extended version (for bitmap fallback).
6363 *
6364 * @returns Old value.
6365 * @param pu32 Pointer to the variable to XOR @a u32 with.
6366 * @param u32 The value to OR @a *pu32 with.
6367 */
6368DECLINLINE(uint32_t) ASMAtomicUoXorExU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_DEF
6369{
6370#if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
6371# if defined(RTASM_ARM64_USE_FEAT_LSE)
6372 uint32_t u32OldRet;
6373 __asm__ __volatile__("Lstart_ASMAtomicUoXorExU32_%=:\n\t"
6374 "ldeor %w[fBitMask], %w[uOldRet], %[pMem]\n\t"
6375 : [pMem] "+Q" (*pu32)
6376 , [uOldRet] "=&r" (u32OldRet)
6377 : [fBitMask] "r" (u32)
6378 : );
6379# else
6380 RTASM_ARM_LOAD_MODIFY_STORE_RET_OLD_32(ASMAtomicUoXorExU32, pu32, NO_BARRIER,
6381 "eor %w[uNew], %w[uOld], %w[uVal]\n\t",
6382 "eor %[uNew], %[uOld], %[uVal]\n\t",
6383 [uVal] "r" (u32));
6384# endif
6385 return u32OldRet;
6386
6387#else
6388 return ASMAtomicXorExU32(pu32, u32); /* (we have no unordered cmpxchg primitive atm.) */
6389#endif
6390}
6391
6392
6393/**
6394 * Atomically XOR a signed 32-bit value, unordered.
6395 *
6396 * @param pi32 Pointer to the variable to XOR @a u32 with.
6397 * @param i32 The value to XOR @a *pu32 with.
6398 *
6399 * @remarks x86: Requires a 386 or later.
6400 */
6401DECLINLINE(void) ASMAtomicUoXorS32(int32_t volatile RT_FAR *pi32, int32_t i32) RT_NOTHROW_DEF
6402{
6403 ASMAtomicUoXorU32((uint32_t volatile RT_FAR *)pi32, (uint32_t)i32);
6404}
6405
6406
6407/**
6408 * Atomically increment an unsigned 32-bit value, unordered.
6409 *
6410 * @returns the new value.
6411 * @param pu32 Pointer to the variable to increment.
6412 *
6413 * @remarks x86: Requires a 486 or later.
6414 */
6415#if RT_INLINE_ASM_EXTERNAL_TMP_ARM
6416RT_ASM_DECL_PRAGMA_WATCOM(uint32_t) ASMAtomicUoIncU32(uint32_t volatile RT_FAR *pu32) RT_NOTHROW_PROTO;
6417#else
6418DECLINLINE(uint32_t) ASMAtomicUoIncU32(uint32_t volatile RT_FAR *pu32) RT_NOTHROW_DEF
6419{
6420# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
6421 uint32_t u32;
6422# if RT_INLINE_ASM_GNU_STYLE
6423 __asm__ __volatile__("xaddl %0, %1\n\t"
6424 : "=r" (u32)
6425 , "=m" (*pu32)
6426 : "0" (1)
6427 , "m" (*pu32)
6428 : "memory" /** @todo why 'memory'? */
6429 , "cc");
6430 return u32 + 1;
6431# else
6432 __asm
6433 {
6434 mov eax, 1
6435# ifdef RT_ARCH_AMD64
6436 mov rdx, [pu32]
6437 xadd [rdx], eax
6438# else
6439 mov edx, [pu32]
6440 xadd [edx], eax
6441# endif
6442 mov u32, eax
6443 }
6444 return u32 + 1;
6445# endif
6446
6447# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
6448 /* M1 benchmark: ldadd=2031 vs non-lse=6301 (ps/call) */
6449# if defined(RTASM_ARM64_USE_FEAT_LSE)
6450 uint32_t u32NewRet;
6451 __asm__ __volatile__("Lstart_ASMAtomicUoIncU32_%=:\n\t"
6452 "ldadd %w[uAddend], %w[uNewRet], %[pMem]\n\t"
6453 "add %w[uNewRet], %w[uNewRet], #1\n\t"
6454 : [pMem] "+Q" (*pu32)
6455 , [uNewRet] "=&r" (u32NewRet)
6456 : [uAddend] "r" ((uint32_t)1)
6457 : );
6458# else
6459 RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_32(ASMAtomicUoIncU32, pu32, NO_BARRIER,
6460 "add %w[uNew], %w[uNew], #1\n\t",
6461 "add %[uNew], %[uNew], #1\n\t" /* arm6 / thumb2+ */,
6462 "X" (0) /* dummy */);
6463# endif
6464 return u32NewRet;
6465
6466# else
6467# error "Port me"
6468# endif
6469}
6470#endif
6471
6472
6473/**
6474 * Atomically decrement an unsigned 32-bit value, unordered.
6475 *
6476 * @returns the new value.
6477 * @param pu32 Pointer to the variable to decrement.
6478 *
6479 * @remarks x86: Requires a 486 or later.
6480 */
6481#if RT_INLINE_ASM_EXTERNAL_TMP_ARM
6482RT_ASM_DECL_PRAGMA_WATCOM(uint32_t) ASMAtomicUoDecU32(uint32_t volatile RT_FAR *pu32) RT_NOTHROW_PROTO;
6483#else
6484DECLINLINE(uint32_t) ASMAtomicUoDecU32(uint32_t volatile RT_FAR *pu32) RT_NOTHROW_DEF
6485{
6486# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
6487 uint32_t u32;
6488# if RT_INLINE_ASM_GNU_STYLE
6489 __asm__ __volatile__("lock; xaddl %0, %1\n\t"
6490 : "=r" (u32)
6491 , "=m" (*pu32)
6492 : "0" (-1)
6493 , "m" (*pu32)
6494 : "memory"
6495 , "cc");
6496 return u32 - 1;
6497# else
6498 __asm
6499 {
6500 mov eax, -1
6501# ifdef RT_ARCH_AMD64
6502 mov rdx, [pu32]
6503 xadd [rdx], eax
6504# else
6505 mov edx, [pu32]
6506 xadd [edx], eax
6507# endif
6508 mov u32, eax
6509 }
6510 return u32 - 1;
6511# endif
6512
6513# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
6514 /* M1 benchmark: ldadd=2101 vs non-lse=6268 (ps/call) */
6515# if defined(RTASM_ARM64_USE_FEAT_LSE)
6516 uint32_t u32NewRet;
6517 __asm__ __volatile__("Lstart_ASMAtomicUoDecU32_%=:\n\t"
6518 "ldadd %w[uAddend], %w[uNewRet], %[pMem]\n\t"
6519 "sub %w[uNewRet], %w[uNewRet], #1\n\t"
6520 : [pMem] "+Q" (*pu32)
6521 , [uNewRet] "=&r" (u32NewRet)
6522 : [uAddend] "r" (~(uint32_t)0)
6523 : );
6524# else
6525 RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_32(ASMAtomicUoDecU32, pu32, NO_BARRIER,
6526 "sub %w[uNew], %w[uNew], #1\n\t",
6527 "sub %[uNew], %[uNew], #1\n\t" /* arm6 / thumb2+ */,
6528 "X" (0) /* dummy */);
6529# endif
6530 return u32NewRet;
6531
6532# else
6533# error "Port me"
6534# endif
6535}
6536#endif
6537
/** @todo Move ASMByteSwapU16, ASMByteSwapU32 and ASMByteSwapU64 into their own
 *        header, as byte swapping is a common reason for including asm.h. */
6540
6541
6542/**
6543 * Reverse the byte order of the given 16-bit integer.
6544 *
6545 * @returns Revert
6546 * @param u16 16-bit integer value.
6547 */
6548#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
6549RT_ASM_DECL_PRAGMA_WATCOM(uint16_t) ASMByteSwapU16(uint16_t u16) RT_NOTHROW_PROTO;
6550#else
6551DECLINLINE(uint16_t) ASMByteSwapU16(uint16_t u16) RT_NOTHROW_DEF
6552{
6553# if RT_INLINE_ASM_USES_INTRIN
6554 return _byteswap_ushort(u16);
6555
6556# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
6557# if RT_INLINE_ASM_GNU_STYLE
6558 __asm__ ("rorw $8, %0" : "=r" (u16) : "0" (u16) : "cc");
6559# else
6560 _asm
6561 {
6562 mov ax, [u16]
6563 ror ax, 8
6564 mov [u16], ax
6565 }
6566# endif
6567 return u16;
6568
6569# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
6570 uint32_t u32Ret;
6571 __asm__ __volatile__(
6572# if defined(RT_ARCH_ARM64)
6573 "rev16 %w[uRet], %w[uVal]\n\t"
6574# else
6575 "rev16 %[uRet], %[uVal]\n\t"
6576# endif
6577 : [uRet] "=r" (u32Ret)
6578 : [uVal] "r" (u16));
6579 return (uint16_t)u32Ret;
6580
6581# else
6582# error "Port me"
6583# endif
6584}
6585#endif
6586
6587
6588/**
6589 * Reverse the byte order of the given 32-bit integer.
6590 *
6591 * @returns Revert
6592 * @param u32 32-bit integer value.
6593 */
6594#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
6595RT_ASM_DECL_PRAGMA_WATCOM(uint32_t) ASMByteSwapU32(uint32_t u32) RT_NOTHROW_PROTO;
6596#else
6597DECLINLINE(uint32_t) ASMByteSwapU32(uint32_t u32) RT_NOTHROW_DEF
6598{
6599# if RT_INLINE_ASM_USES_INTRIN
6600 return _byteswap_ulong(u32);
6601
6602# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
6603# if RT_INLINE_ASM_GNU_STYLE
6604 __asm__ ("bswapl %0" : "=r" (u32) : "0" (u32));
6605# else
6606 _asm
6607 {
6608 mov eax, [u32]
6609 bswap eax
6610 mov [u32], eax
6611 }
6612# endif
6613 return u32;
6614
6615# elif defined(RT_ARCH_ARM64)
6616 uint64_t u64Ret;
6617 __asm__ __volatile__("rev32 %[uRet], %[uVal]\n\t"
6618 : [uRet] "=r" (u64Ret)
6619 : [uVal] "r" ((uint64_t)u32));
6620 return (uint32_t)u64Ret;
6621
6622# elif defined(RT_ARCH_ARM32)
6623 __asm__ __volatile__("rev %[uRet], %[uVal]\n\t"
6624 : [uRet] "=r" (u32)
6625 : [uVal] "[uRet]" (u32));
6626 return u32;
6627
6628# else
6629# error "Port me"
6630# endif
6631}
6632#endif
6633
6634
6635/**
6636 * Reverse the byte order of the given 64-bit integer.
6637 *
6638 * @returns Revert
6639 * @param u64 64-bit integer value.
6640 */
6641DECLINLINE(uint64_t) ASMByteSwapU64(uint64_t u64) RT_NOTHROW_DEF
6642{
6643#if defined(RT_ARCH_AMD64) && RT_INLINE_ASM_USES_INTRIN
6644 return _byteswap_uint64(u64);
6645
6646# elif RT_INLINE_ASM_GNU_STYLE && defined(RT_ARCH_AMD64)
6647 __asm__ ("bswapq %0" : "=r" (u64) : "0" (u64));
6648 return u64;
6649
6650# elif defined(RT_ARCH_ARM64)
6651 __asm__ __volatile__("rev %[uRet], %[uVal]\n\t"
6652 : [uRet] "=r" (u64)
6653 : [uVal] "[uRet]" (u64));
6654 return u64;
6655
6656#else
6657 return (uint64_t)ASMByteSwapU32((uint32_t)u64) << 32
6658 | (uint64_t)ASMByteSwapU32((uint32_t)(u64 >> 32));
6659#endif
6660}
6661
6662
6663
6664/** @defgroup grp_inline_bits Bitmap Operations
6665 *
6666 * @todo Move these into a separate header, with standard IPRT prefix
6667 * (RTBitmapXxx). Move the more complex (searched) stuff into C source
6668 * files.
6669 *
6670 * @{
6671 */
6672
6673
6674/**
6675 * Sets a bit in a bitmap.
6676 *
6677 * @param pvBitmap Pointer to the bitmap (little endian). This should be
6678 * 32-bit aligned.
6679 * @param iBit The bit to set.
6680 *
6681 * @remarks The 32-bit aligning of pvBitmap is not a strict requirement.
6682 * However, doing so will yield better performance as well as avoiding
6683 * traps accessing the last bits in the bitmap.
6684 */
6685#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
6686RT_ASM_DECL_PRAGMA_WATCOM(void) ASMBitSet(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_PROTO;
6687#else
6688DECLINLINE(void) ASMBitSet(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_DEF
6689{
6690# if RT_INLINE_ASM_USES_INTRIN
6691 _bittestandset((long RT_FAR *)pvBitmap, iBit);
6692
6693# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
6694# if RT_INLINE_ASM_GNU_STYLE
6695 __asm__ __volatile__("btsl %1, %0"
6696 : "=m" (*(volatile long RT_FAR *)pvBitmap)
6697 : "Ir" (iBit)
6698 , "m" (*(volatile long RT_FAR *)pvBitmap)
6699 : "memory"
6700 , "cc");
6701# else
6702 __asm
6703 {
6704# ifdef RT_ARCH_AMD64
6705 mov rax, [pvBitmap]
6706 mov edx, [iBit]
6707 bts [rax], edx
6708# else
6709 mov eax, [pvBitmap]
6710 mov edx, [iBit]
6711 bts [eax], edx
6712# endif
6713 }
6714# endif
6715
6716# else
6717 int32_t offBitmap = iBit / 32;
6718 AssertStmt(!((uintptr_t)pvBitmap & 3), offBitmap += (uintptr_t)pvBitmap & 3; iBit += ((uintptr_t)pvBitmap & 3) * 8);
6719 ASMAtomicUoOrU32(&((uint32_t volatile *)pvBitmap)[offBitmap], RT_H2LE_U32(RT_BIT_32(iBit & 31)));
6720# endif
6721}
6722#endif
6723
6724
6725/**
6726 * Atomically sets a bit in a bitmap, ordered.
6727 *
6728 * @param pvBitmap Pointer to the bitmap (little endian). Must be 32-bit
6729 * aligned, otherwise the memory access isn't atomic!
6730 * @param iBit The bit to set.
6731 *
6732 * @remarks x86: Requires a 386 or later.
6733 */
6734#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
6735RT_ASM_DECL_PRAGMA_WATCOM(void) ASMAtomicBitSet(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_PROTO;
6736#else
6737DECLINLINE(void) ASMAtomicBitSet(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_DEF
6738{
6739 AssertMsg(!((uintptr_t)pvBitmap & 3), ("address %p not 32-bit aligned", pvBitmap));
6740# if RT_INLINE_ASM_USES_INTRIN
6741 _interlockedbittestandset((long RT_FAR *)pvBitmap, iBit);
6742# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
6743# if RT_INLINE_ASM_GNU_STYLE
6744 __asm__ __volatile__("lock; btsl %1, %0"
6745 : "=m" (*(volatile long *)pvBitmap)
6746 : "Ir" (iBit)
6747 , "m" (*(volatile long *)pvBitmap)
6748 : "memory"
6749 , "cc");
6750# else
6751 __asm
6752 {
6753# ifdef RT_ARCH_AMD64
6754 mov rax, [pvBitmap]
6755 mov edx, [iBit]
6756 lock bts [rax], edx
6757# else
6758 mov eax, [pvBitmap]
6759 mov edx, [iBit]
6760 lock bts [eax], edx
6761# endif
6762 }
6763# endif
6764
6765# else
6766 ASMAtomicOrU32(&((uint32_t volatile *)pvBitmap)[iBit / 32], RT_H2LE_U32(RT_BIT_32(iBit & 31)));
6767# endif
6768}
6769#endif
6770
6771
6772/**
6773 * Clears a bit in a bitmap.
6774 *
6775 * @param pvBitmap Pointer to the bitmap (little endian).
6776 * @param iBit The bit to clear.
6777 *
6778 * @remarks The 32-bit aligning of pvBitmap is not a strict requirement.
6779 * However, doing so will yield better performance as well as avoiding
6780 * traps accessing the last bits in the bitmap.
6781 */
6782#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
6783RT_ASM_DECL_PRAGMA_WATCOM(void) ASMBitClear(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_PROTO;
6784#else
6785DECLINLINE(void) ASMBitClear(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_DEF
6786{
6787# if RT_INLINE_ASM_USES_INTRIN
6788 _bittestandreset((long RT_FAR *)pvBitmap, iBit);
6789
6790# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
6791# if RT_INLINE_ASM_GNU_STYLE
6792 __asm__ __volatile__("btrl %1, %0"
6793 : "=m" (*(volatile long RT_FAR *)pvBitmap)
6794 : "Ir" (iBit)
6795 , "m" (*(volatile long RT_FAR *)pvBitmap)
6796 : "memory"
6797 , "cc");
6798# else
6799 __asm
6800 {
6801# ifdef RT_ARCH_AMD64
6802 mov rax, [pvBitmap]
6803 mov edx, [iBit]
6804 btr [rax], edx
6805# else
6806 mov eax, [pvBitmap]
6807 mov edx, [iBit]
6808 btr [eax], edx
6809# endif
6810 }
6811# endif
6812
6813# else
6814 int32_t offBitmap = iBit / 32;
6815 AssertStmt(!((uintptr_t)pvBitmap & 3), offBitmap += (uintptr_t)pvBitmap & 3; iBit += ((uintptr_t)pvBitmap & 3) * 8);
6816 ASMAtomicUoAndU32(&((uint32_t volatile *)pvBitmap)[offBitmap], RT_H2LE_U32(~RT_BIT_32(iBit & 31)));
6817# endif
6818}
6819#endif
6820
6821
6822/**
6823 * Atomically clears a bit in a bitmap, ordered.
6824 *
6825 * @param pvBitmap Pointer to the bitmap (little endian). Must be 32-bit
6826 * aligned, otherwise the memory access isn't atomic!
6827 * @param iBit The bit to toggle set.
6828 *
6829 * @remarks No memory barrier, take care on smp.
6830 * @remarks x86: Requires a 386 or later.
6831 */
6832#if RT_INLINE_ASM_EXTERNAL_TMP_ARM
6833RT_ASM_DECL_PRAGMA_WATCOM(void) ASMAtomicBitClear(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_PROTO;
6834#else
6835DECLINLINE(void) ASMAtomicBitClear(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_DEF
6836{
6837 AssertMsg(!((uintptr_t)pvBitmap & 3), ("address %p not 32-bit aligned", pvBitmap));
6838# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
6839# if RT_INLINE_ASM_GNU_STYLE
6840 __asm__ __volatile__("lock; btrl %1, %0"
6841 : "=m" (*(volatile long RT_FAR *)pvBitmap)
6842 : "Ir" (iBit)
6843 , "m" (*(volatile long RT_FAR *)pvBitmap)
6844 : "memory"
6845 , "cc");
6846# else
6847 __asm
6848 {
6849# ifdef RT_ARCH_AMD64
6850 mov rax, [pvBitmap]
6851 mov edx, [iBit]
6852 lock btr [rax], edx
6853# else
6854 mov eax, [pvBitmap]
6855 mov edx, [iBit]
6856 lock btr [eax], edx
6857# endif
6858 }
6859# endif
6860# else
6861 ASMAtomicAndU32(&((uint32_t volatile *)pvBitmap)[iBit / 32], RT_H2LE_U32(~RT_BIT_32(iBit & 31)));
6862# endif
6863}
6864#endif
6865
6866
6867/**
6868 * Toggles a bit in a bitmap.
6869 *
6870 * @param pvBitmap Pointer to the bitmap (little endian).
6871 * @param iBit The bit to toggle.
6872 *
6873 * @remarks The 32-bit aligning of pvBitmap is not a strict requirement.
6874 * However, doing so will yield better performance as well as avoiding
6875 * traps accessing the last bits in the bitmap.
6876 */
6877#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
6878RT_ASM_DECL_PRAGMA_WATCOM(void) ASMBitToggle(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_PROTO;
6879#else
6880DECLINLINE(void) ASMBitToggle(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_DEF
6881{
6882# if RT_INLINE_ASM_USES_INTRIN
6883 _bittestandcomplement((long RT_FAR *)pvBitmap, iBit);
6884# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
6885# if RT_INLINE_ASM_GNU_STYLE
6886 __asm__ __volatile__("btcl %1, %0"
6887 : "=m" (*(volatile long *)pvBitmap)
6888 : "Ir" (iBit)
6889 , "m" (*(volatile long *)pvBitmap)
6890 : "memory"
6891 , "cc");
6892# else
6893 __asm
6894 {
6895# ifdef RT_ARCH_AMD64
6896 mov rax, [pvBitmap]
6897 mov edx, [iBit]
6898 btc [rax], edx
6899# else
6900 mov eax, [pvBitmap]
6901 mov edx, [iBit]
6902 btc [eax], edx
6903# endif
6904 }
6905# endif
6906# else
6907 int32_t offBitmap = iBit / 32;
6908 AssertStmt(!((uintptr_t)pvBitmap & 3), offBitmap += (uintptr_t)pvBitmap & 3; iBit += ((uintptr_t)pvBitmap & 3) * 8);
6909 ASMAtomicUoXorU32(&((uint32_t volatile *)pvBitmap)[offBitmap], RT_H2LE_U32(RT_BIT_32(iBit & 31)));
6910# endif
6911}
6912#endif
6913
6914
6915/**
6916 * Atomically toggles a bit in a bitmap, ordered.
6917 *
6918 * @param pvBitmap Pointer to the bitmap (little endian). Must be 32-bit
6919 * aligned, otherwise the memory access isn't atomic!
6920 * @param iBit The bit to test and set.
6921 *
6922 * @remarks x86: Requires a 386 or later.
6923 */
6924#if RT_INLINE_ASM_EXTERNAL_TMP_ARM
6925RT_ASM_DECL_PRAGMA_WATCOM(void) ASMAtomicBitToggle(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_PROTO;
6926#else
6927DECLINLINE(void) ASMAtomicBitToggle(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_DEF
6928{
6929 AssertMsg(!((uintptr_t)pvBitmap & 3), ("address %p not 32-bit aligned", pvBitmap));
6930# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
6931# if RT_INLINE_ASM_GNU_STYLE
6932 __asm__ __volatile__("lock; btcl %1, %0"
6933 : "=m" (*(volatile long RT_FAR *)pvBitmap)
6934 : "Ir" (iBit)
6935 , "m" (*(volatile long RT_FAR *)pvBitmap)
6936 : "memory"
6937 , "cc");
6938# else
6939 __asm
6940 {
6941# ifdef RT_ARCH_AMD64
6942 mov rax, [pvBitmap]
6943 mov edx, [iBit]
6944 lock btc [rax], edx
6945# else
6946 mov eax, [pvBitmap]
6947 mov edx, [iBit]
6948 lock btc [eax], edx
6949# endif
6950 }
6951# endif
6952# else
6953 ASMAtomicXorU32(&((uint32_t volatile *)pvBitmap)[iBit / 32], RT_H2LE_U32(RT_BIT_32(iBit & 31)));
6954# endif
6955}
6956#endif
6957
6958
6959/**
6960 * Tests and sets a bit in a bitmap.
6961 *
6962 * @returns true if the bit was set.
6963 * @returns false if the bit was clear.
6964 *
6965 * @param pvBitmap Pointer to the bitmap (little endian).
6966 * @param iBit The bit to test and set.
6967 *
6968 * @remarks The 32-bit aligning of pvBitmap is not a strict requirement.
6969 * However, doing so will yield better performance as well as avoiding
6970 * traps accessing the last bits in the bitmap.
6971 */
6972#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
6973RT_ASM_DECL_PRAGMA_WATCOM(bool) ASMBitTestAndSet(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_PROTO;
6974#else
6975DECLINLINE(bool) ASMBitTestAndSet(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_DEF
6976{
6977 union { bool f; uint32_t u32; uint8_t u8; } rc;
6978# if RT_INLINE_ASM_USES_INTRIN
6979 rc.u8 = _bittestandset((long RT_FAR *)pvBitmap, iBit);
6980
6981# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
6982# if RT_INLINE_ASM_GNU_STYLE
6983 __asm__ __volatile__("btsl %2, %1\n\t"
6984 "setc %b0\n\t"
6985 "andl $1, %0\n\t"
6986 : "=q" (rc.u32)
6987 , "=m" (*(volatile long RT_FAR *)pvBitmap)
6988 : "Ir" (iBit)
6989 , "m" (*(volatile long RT_FAR *)pvBitmap)
6990 : "memory"
6991 , "cc");
6992# else
6993 __asm
6994 {
6995 mov edx, [iBit]
6996# ifdef RT_ARCH_AMD64
6997 mov rax, [pvBitmap]
6998 bts [rax], edx
6999# else
7000 mov eax, [pvBitmap]
7001 bts [eax], edx
7002# endif
7003 setc al
7004 and eax, 1
7005 mov [rc.u32], eax
7006 }
7007# endif
7008
7009# else
7010 int32_t offBitmap = iBit / 32;
7011 AssertStmt(!((uintptr_t)pvBitmap & 3), offBitmap += (uintptr_t)pvBitmap & 3; iBit += ((uintptr_t)pvBitmap & 3) * 8);
7012 rc.u32 = RT_LE2H_U32(ASMAtomicUoOrExU32(&((uint32_t volatile *)pvBitmap)[offBitmap], RT_H2LE_U32(RT_BIT_32(iBit & 31))))
7013 >> (iBit & 31);
7014 rc.u32 &= 1;
7015# endif
7016 return rc.f;
7017}
7018#endif
7019
7020
7021/**
7022 * Atomically tests and sets a bit in a bitmap, ordered.
7023 *
 * The return value reports the state of the bit before this call set it.
 *
7024 * @returns true if the bit was set.
7025 * @returns false if the bit was clear.
7026 *
7027 * @param pvBitmap Pointer to the bitmap (little endian). Must be 32-bit
7028 * aligned, otherwise the memory access isn't atomic!
7029 * @param iBit The bit to set.
7030 *
7031 * @remarks x86: Requires a 386 or later.
7032 */
7033#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
7034RT_ASM_DECL_PRAGMA_WATCOM(bool) ASMAtomicBitTestAndSet(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_PROTO;
7035#else
7036DECLINLINE(bool) ASMAtomicBitTestAndSet(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_DEF
7037{
    /* Every path below stores its result through u8 or u32; the function returns it through f. */
7038 union { bool f; uint32_t u32; uint8_t u8; } rc;
7039 AssertMsg(!((uintptr_t)pvBitmap & 3), ("address %p not 32-bit aligned", pvBitmap));
7040# if RT_INLINE_ASM_USES_INTRIN
    /* Interlocked compiler intrinsic path. */
7041 rc.u8 = _interlockedbittestandset((long RT_FAR *)pvBitmap, iBit);
7042# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
7043# if RT_INLINE_ASM_GNU_STYLE
    /* LOCK BTS leaves the previous value of the bit in CF; SETC + AND reduce %0 to 0 or 1. */
7044 __asm__ __volatile__("lock; btsl %2, %1\n\t"
7045 "setc %b0\n\t"
7046 "andl $1, %0\n\t"
7047 : "=q" (rc.u32)
7048 , "=m" (*(volatile long RT_FAR *)pvBitmap)
7049 : "Ir" (iBit)
7050 , "m" (*(volatile long RT_FAR *)pvBitmap)
7051 : "memory"
7052 , "cc");
7053# else
    /* Same LOCK BTS / SETC / AND sequence in MSC-style inline assembly. */
7054 __asm
7055 {
7056 mov edx, [iBit]
7057# ifdef RT_ARCH_AMD64
7058 mov rax, [pvBitmap]
7059 lock bts [rax], edx
7060# else
7061 mov eax, [pvBitmap]
7062 lock bts [eax], edx
7063# endif
7064 setc al
7065 and eax, 1
7066 mov [rc.u32], eax
7067 }
7068# endif
7069
7070# else
    /* Portable path: atomic OR of the (little-endian) bit mask into the containing 32-bit word,
       then move the tested bit of the returned word (presumably its previous value) to bit 0. */
7071 rc.u32 = RT_LE2H_U32(ASMAtomicOrExU32(&((uint32_t volatile *)pvBitmap)[iBit / 32], RT_H2LE_U32(RT_BIT_32(iBit & 31))))
7072 >> (iBit & 31);
7073 rc.u32 &= 1;
7074# endif
7075 return rc.f;
7076}
7077#endif
7078
7079
7080/**
7081 * Tests and clears a bit in a bitmap.
7082 *
 * The return value reports the state of the bit before this call cleared it.
 *
7083 * @returns true if the bit was set.
7084 * @returns false if the bit was clear.
7085 *
7086 * @param pvBitmap Pointer to the bitmap (little endian).
7087 * @param iBit The bit to test and clear.
7088 *
7089 * @remarks The 32-bit aligning of pvBitmap is not a strict requirement.
7090 * However, doing so will yield better performance as well as avoiding
7091 * traps accessing the last bits in the bitmap.
7092 */
7093#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
7094RT_ASM_DECL_PRAGMA_WATCOM(bool) ASMBitTestAndClear(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_PROTO;
7095#else
7096DECLINLINE(bool) ASMBitTestAndClear(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_DEF
7097{
    /* Every path below stores its result through u8 or u32; the function returns it through f. */
7098 union { bool f; uint32_t u32; uint8_t u8; } rc;
7099# if RT_INLINE_ASM_USES_INTRIN
    /* Compiler intrinsic path. */
7100 rc.u8 = _bittestandreset((long RT_FAR *)pvBitmap, iBit);
7101
7102# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
7103# if RT_INLINE_ASM_GNU_STYLE
    /* BTR leaves the previous value of the bit in CF; SETC + AND reduce %0 to exactly 0 or 1. */
7104 __asm__ __volatile__("btrl %2, %1\n\t"
7105 "setc %b0\n\t"
7106 "andl $1, %0\n\t"
7107 : "=q" (rc.u32)
7108 , "=m" (*(volatile long RT_FAR *)pvBitmap)
7109 : "Ir" (iBit)
7110 , "m" (*(volatile long RT_FAR *)pvBitmap)
7111 : "memory"
7112 , "cc");
7113# else
    /* Same BTR / SETC / AND sequence in MSC-style inline assembly. */
7114 __asm
7115 {
7116 mov edx, [iBit]
7117# ifdef RT_ARCH_AMD64
7118 mov rax, [pvBitmap]
7119 btr [rax], edx
7120# else
7121 mov eax, [pvBitmap]
7122 btr [eax], edx
7123# endif
7124 setc al
7125 and eax, 1
7126 mov [rc.u32], eax
7127 }
7128# endif
7129
7130# else
    /* Portable path: AND the containing 32-bit word with the inverted (little-endian) bit mask
       using the unordered atomic helper, then move the tested bit of the returned word
       (presumably its previous value) down to bit 0. */
    /* NOTE(review): offBitmap is used as a 32-bit word index, yet the statement attached to the
       alignment assertion adds a byte count (pvBitmap & 3) to it - confirm this is intended. */
7131 int32_t offBitmap = iBit / 32;
7132 AssertStmt(!((uintptr_t)pvBitmap & 3), offBitmap += (uintptr_t)pvBitmap & 3; iBit += ((uintptr_t)pvBitmap & 3) * 8);
7133 rc.u32 = RT_LE2H_U32(ASMAtomicUoAndExU32(&((uint32_t volatile *)pvBitmap)[offBitmap], RT_H2LE_U32(~RT_BIT_32(iBit & 31))))
7134 >> (iBit & 31);
7135 rc.u32 &= 1;
7136# endif
7137 return rc.f;
7138}
7139#endif
7140
7141
7142/**
7143 * Atomically tests and clears a bit in a bitmap, ordered.
7144 *
 * The return value reports the state of the bit before this call cleared it.
 *
7145 * @returns true if the bit was set.
7146 * @returns false if the bit was clear.
7147 *
7148 * @param pvBitmap Pointer to the bitmap (little endian). Must be 32-bit
7149 * aligned, otherwise the memory access isn't atomic!
7150 * @param iBit The bit to test and clear.
7151 *
7152 * @remarks No memory barrier, take care on smp.
 *          NOTE(review): the remark above conflicts with the "ordered" wording
 *          in the brief and with the code below, which uses a LOCK-prefixed
 *          instruction on x86 and ASMAtomicAndExU32 (not the "Uo" variant)
 *          elsewhere - confirm which statement is the intended contract.
7153 * @remarks x86: Requires a 386 or later.
7154 */
7155#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
7156RT_ASM_DECL_PRAGMA_WATCOM(bool) ASMAtomicBitTestAndClear(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_PROTO;
7157#else
7158DECLINLINE(bool) ASMAtomicBitTestAndClear(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_DEF
7159{
    /* Every path below stores its result through u8 or u32; the function returns it through f. */
7160 union { bool f; uint32_t u32; uint8_t u8; } rc;
7161 AssertMsg(!((uintptr_t)pvBitmap & 3), ("address %p not 32-bit aligned", pvBitmap));
7162# if RT_INLINE_ASM_USES_INTRIN
    /* Interlocked compiler intrinsic path. */
7163 rc.u8 = _interlockedbittestandreset((long RT_FAR *)pvBitmap, iBit);
7164
7165# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
7166# if RT_INLINE_ASM_GNU_STYLE
    /* LOCK BTR leaves the previous value of the bit in CF; SETC + AND reduce %0 to 0 or 1. */
7167 __asm__ __volatile__("lock; btrl %2, %1\n\t"
7168 "setc %b0\n\t"
7169 "andl $1, %0\n\t"
7170 : "=q" (rc.u32)
7171 , "=m" (*(volatile long RT_FAR *)pvBitmap)
7172 : "Ir" (iBit)
7173 , "m" (*(volatile long RT_FAR *)pvBitmap)
7174 : "memory"
7175 , "cc");
7176# else
    /* Same LOCK BTR / SETC / AND sequence in MSC-style inline assembly. */
7177 __asm
7178 {
7179 mov edx, [iBit]
7180# ifdef RT_ARCH_AMD64
7181 mov rax, [pvBitmap]
7182 lock btr [rax], edx
7183# else
7184 mov eax, [pvBitmap]
7185 lock btr [eax], edx
7186# endif
7187 setc al
7188 and eax, 1
7189 mov [rc.u32], eax
7190 }
7191# endif
7192
7193# else
    /* Portable path: atomic AND of the containing 32-bit word with the inverted (little-endian)
       bit mask, then move the tested bit of the returned word (presumably its previous value)
       down to bit 0. */
7194 rc.u32 = RT_LE2H_U32(ASMAtomicAndExU32(&((uint32_t volatile *)pvBitmap)[iBit / 32], RT_H2LE_U32(~RT_BIT_32(iBit & 31))))
7195 >> (iBit & 31);
7196 rc.u32 &= 1;
7197# endif
7198 return rc.f;
7199}
7200#endif
7201
7202
7203/**
7204 * Tests and toggles a bit in a bitmap.
7205 *
 * The return value reports the state of the bit before this call flipped it.
 *
7206 * @returns true if the bit was set.
7207 * @returns false if the bit was clear.
7208 *
7209 * @param pvBitmap Pointer to the bitmap (little endian).
7210 * @param iBit The bit to test and toggle.
7211 *
7212 * @remarks The 32-bit aligning of pvBitmap is not a strict requirement.
7213 * However, doing so will yield better performance as well as avoiding
7214 * traps accessing the last bits in the bitmap.
7215 */
7216#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
7217RT_ASM_DECL_PRAGMA_WATCOM(bool) ASMBitTestAndToggle(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_PROTO;
7218#else
7219DECLINLINE(bool) ASMBitTestAndToggle(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_DEF
7220{
    /* Every path below stores its result through u8 or u32; the function returns it through f. */
7221 union { bool f; uint32_t u32; uint8_t u8; } rc;
7222# if RT_INLINE_ASM_USES_INTRIN
    /* Compiler intrinsic path. */
7223 rc.u8 = _bittestandcomplement((long RT_FAR *)pvBitmap, iBit);
7224
7225# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
7226# if RT_INLINE_ASM_GNU_STYLE
    /* BTC leaves the previous value of the bit in CF; SETC + AND reduce %0 to exactly 0 or 1. */
7227 __asm__ __volatile__("btcl %2, %1\n\t"
7228 "setc %b0\n\t"
7229 "andl $1, %0\n\t"
7230 : "=q" (rc.u32)
7231 , "=m" (*(volatile long RT_FAR *)pvBitmap)
7232 : "Ir" (iBit)
7233 , "m" (*(volatile long RT_FAR *)pvBitmap)
7234 : "memory"
7235 , "cc");
7236# else
    /* Same BTC / SETC / AND sequence in MSC-style inline assembly. */
7237 __asm
7238 {
7239 mov edx, [iBit]
7240# ifdef RT_ARCH_AMD64
7241 mov rax, [pvBitmap]
7242 btc [rax], edx
7243# else
7244 mov eax, [pvBitmap]
7245 btc [eax], edx
7246# endif
7247 setc al
7248 and eax, 1
7249 mov [rc.u32], eax
7250 }
7251# endif
7252
7253# else
    /* Portable path: XOR the (little-endian) bit mask into the containing 32-bit word using the
       unordered atomic helper, then move the tested bit of the returned word (presumably its
       previous value) down to bit 0. */
    /* NOTE(review): offBitmap is used as a 32-bit word index, yet the statement attached to the
       alignment assertion adds a byte count (pvBitmap & 3) to it - confirm this is intended. */
7254 int32_t offBitmap = iBit / 32;
7255 AssertStmt(!((uintptr_t)pvBitmap & 3), offBitmap += (uintptr_t)pvBitmap & 3; iBit += ((uintptr_t)pvBitmap & 3) * 8);
7256 rc.u32 = RT_LE2H_U32(ASMAtomicUoXorExU32(&((uint32_t volatile *)pvBitmap)[offBitmap], RT_H2LE_U32(RT_BIT_32(iBit & 31))))
7257 >> (iBit & 31);
7258 rc.u32 &= 1;
7259# endif
7260 return rc.f;
7261}
7262#endif
7263
7264
7265/**
7266 * Atomically tests and toggles a bit in a bitmap, ordered.
7267 *
7268 * @returns true if the bit was set.
7269 * @returns false if the bit was clear.
7270 *
7271 * @param pvBitmap Pointer to the bitmap (little endian). Must be 32-bit
7272 * aligned, otherwise the memory access isn't atomic!
7273 * @param iBit The bit to test and toggle.
7274 *
7275 * @remarks x86: Requires a 386 or later.
7276 */
7277#if RT_INLINE_ASM_EXTERNAL_TMP_ARM
7278RT_ASM_DECL_PRAGMA_WATCOM(bool) ASMAtomicBitTestAndToggle(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_PROTO;
7279#else
7280DECLINLINE(bool) ASMAtomicBitTestAndToggle(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_DEF
7281{
7282 union { bool f; uint32_t u32; uint8_t u8; } rc;
7283 AssertMsg(!((uintptr_t)pvBitmap & 3), ("address %p not 32-bit aligned", pvBitmap));
7284# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
7285# if RT_INLINE_ASM_GNU_STYLE
7286 __asm__ __volatile__("lock; btcl %2, %1\n\t"
7287 "setc %b0\n\t"
7288 "andl $1, %0\n\t"
7289 : "=q" (rc.u32)
7290 , "=m" (*(volatile long RT_FAR *)pvBitmap)
7291 : "Ir" (iBit)
7292 , "m" (*(volatile long RT_FAR *)pvBitmap)
7293 : "memory"
7294 , "cc");
7295# else
7296 __asm
7297 {
7298 mov edx, [iBit]
7299# ifdef RT_ARCH_AMD64
7300 mov rax, [pvBitmap]
7301 lock btc [rax], edx
7302# else
7303 mov eax, [pvBitmap]
7304 lock btc [eax], edx
7305# endif
7306 setc al
7307 and eax, 1
7308 mov [rc.u32], eax
7309 }
7310# endif
7311
7312# else
7313 rc.u32 = RT_H2LE_U32(ASMAtomicXorExU32(&((uint32_t volatile *)pvBitmap)[iBit / 32], RT_LE2H_U32(RT_BIT_32(iBit & 31))))
7314 >> (iBit & 31);
7315 rc.u32 &= 1;
7316# endif
7317 return rc.f;
7318}
7319#endif
7320
7321
7322/**
7323 * Tests if a bit in a bitmap is set.
7324 *
 * The bitmap is only read, never modified.
 *
7325 * @returns true if the bit is set.
7326 * @returns false if the bit is clear.
7327 *
7328 * @param pvBitmap Pointer to the bitmap (little endian).
7329 * @param iBit The bit to test.
7330 *
7331 * @remarks The 32-bit aligning of pvBitmap is not a strict requirement.
7332 * However, doing so will yield better performance as well as avoiding
7333 * traps accessing the last bits in the bitmap.
7334 */
7335#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
7336RT_ASM_DECL_PRAGMA_WATCOM(bool) ASMBitTest(const volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_PROTO;
7337#else
7338DECLINLINE(bool) ASMBitTest(const volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_DEF
7339{
    /* Every path below stores its result through u32; the function returns it through f. */
7340 union { bool f; uint32_t u32; uint8_t u8; } rc;
7341# if RT_INLINE_ASM_USES_INTRIN
    /* Compiler intrinsic path.  NOTE(review): unlike the test-and-modify siblings this stores
       into rc.u32 rather than rc.u8, and the cast drops const/volatile - confirm intended. */
7342 rc.u32 = _bittest((long *)pvBitmap, iBit);
7343
7344# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
7345# if RT_INLINE_ASM_GNU_STYLE
7346
    /* BT copies the selected bit into CF; SETC + AND reduce %0 to exactly 0 or 1. */
7347 __asm__ __volatile__("btl %2, %1\n\t"
7348 "setc %b0\n\t"
7349 "andl $1, %0\n\t"
7350 : "=q" (rc.u32)
7351 : "m" (*(const volatile long RT_FAR *)pvBitmap)
7352 , "Ir" (iBit)
7353 : "memory"
7354 , "cc");
7355# else
    /* Same BT / SETC / AND sequence in MSC-style inline assembly. */
7356 __asm
7357 {
7358 mov edx, [iBit]
7359# ifdef RT_ARCH_AMD64
7360 mov rax, [pvBitmap]
7361 bt [rax], edx
7362# else
7363 mov eax, [pvBitmap]
7364 bt [eax], edx
7365# endif
7366 setc al
7367 and eax, 1
7368 mov [rc.u32], eax
7369 }
7370# endif
7371
7372# else
    /* Portable path: read the containing 32-bit word, convert it from little endian and shift
       the requested bit down to bit 0. */
    /* NOTE(review): this path release-asserts 32-bit alignment although the remarks above call
       alignment optional - confirm which one holds for non-x86 targets. */
7373 int32_t offBitmap = iBit / 32;
7374 AssertRelease(!((uintptr_t)pvBitmap & (sizeof(uint32_t) - 1)));
7375 rc.u32 = RT_LE2H_U32(ASMAtomicUoReadU32(&((uint32_t volatile *)pvBitmap)[offBitmap])) >> (iBit & 31);
7376 rc.u32 &= 1;
7377# endif
7378 return rc.f;
7379}
7380#endif
7381
7382
7383#ifdef IPRT_INCLUDED_asm_mem_h
7384
7385/**
7386 * Clears a bit range within a bitmap.
7387 *
7388 * @param pvBitmap Pointer to the bitmap (little endian).
7389 * @param iBitStart The First bit to clear.
7390 * @param iBitEnd The first bit not to clear.
7391 */
7392DECLINLINE(void) ASMBitClearRange(volatile void RT_FAR *pvBitmap, size_t iBitStart, size_t iBitEnd) RT_NOTHROW_DEF
7393{
7394 if (iBitStart < iBitEnd)
7395 {
7396 uint32_t volatile RT_FAR *pu32 = (volatile uint32_t RT_FAR *)pvBitmap + (iBitStart >> 5);
7397 size_t iStart = iBitStart & ~(size_t)31;
7398 size_t iEnd = iBitEnd & ~(size_t)31;
7399 if (iStart == iEnd)
7400 *pu32 &= RT_H2LE_U32(((UINT32_C(1) << (iBitStart & 31)) - 1) | ~((UINT32_C(1) << (iBitEnd & 31)) - 1));
7401 else
7402 {
7403 /* bits in first dword. */
7404 if (iBitStart & 31)
7405 {
7406 *pu32 &= RT_H2LE_U32((UINT32_C(1) << (iBitStart & 31)) - 1);
7407 pu32++;
7408 iBitStart = iStart + 32;
7409 }
7410
7411 /* whole dwords. */
7412 if (iBitStart != iEnd)
7413 ASMMemZero32(pu32, (iEnd - iBitStart) >> 3);
7414
7415 /* bits in last dword. */
7416 if (iBitEnd & 31)
7417 {
7418 pu32 = (volatile uint32_t RT_FAR *)pvBitmap + (iBitEnd >> 5);
7419 *pu32 &= RT_H2LE_U32(~((UINT32_C(1) << (iBitEnd & 31)) - 1));
7420 }
7421 }
7422 }
7423}
7424
7425
7426/**
7427 * Sets a bit range within a bitmap.
7428 *
7429 * @param pvBitmap Pointer to the bitmap (little endian).
7430 * @param iBitStart The First bit to set.
7431 * @param iBitEnd The first bit not to set.
7432 */
7433DECLINLINE(void) ASMBitSetRange(volatile void RT_FAR *pvBitmap, size_t iBitStart, size_t iBitEnd) RT_NOTHROW_DEF
7434{
7435 if (iBitStart < iBitEnd)
7436 {
7437 uint32_t volatile RT_FAR *pu32 = (volatile uint32_t RT_FAR *)pvBitmap + (iBitStart >> 5);
7438 size_t iStart = iBitStart & ~(size_t)31;
7439 size_t iEnd = iBitEnd & ~(size_t)31;
7440 if (iStart == iEnd)
7441 *pu32 |= RT_H2LE_U32(((UINT32_C(1) << (iBitEnd - iBitStart)) - 1) << (iBitStart & 31));
7442 else
7443 {
7444 /* bits in first dword. */
7445 if (iBitStart & 31)
7446 {
7447 *pu32 |= RT_H2LE_U32(~((UINT32_C(1) << (iBitStart & 31)) - 1));
7448 pu32++;
7449 iBitStart = iStart + 32;
7450 }
7451
7452 /* whole dword. */
7453 if (iBitStart != iEnd)
7454 ASMMemFill32(pu32, (iEnd - iBitStart) >> 3, ~UINT32_C(0));
7455
7456 /* bits in last dword. */
7457 if (iBitEnd & 31)
7458 {
7459 pu32 = (volatile uint32_t RT_FAR *)pvBitmap + (iBitEnd >> 5);
7460 *pu32 |= RT_H2LE_U32((UINT32_C(1) << (iBitEnd & 31)) - 1);
7461 }
7462 }
7463 }
7464}
7465
7466#endif /* IPRT_INCLUDED_asm_mem_h */
7467
7468/**
7469 * Finds the first clear bit in a bitmap.
7470 *
 * The inline variants scan the bitmap one 32-bit word at a time; cBits is
 * rounded up to a multiple of 32 before the scan.
 *
7471 * @returns Index of the first zero bit.
7472 * @returns -1 if no clear bit was found.
7473 * @param pvBitmap Pointer to the bitmap (little endian).
7474 * @param cBits The number of bits in the bitmap. Multiple of 32.
7475 */
7476#if RT_INLINE_ASM_EXTERNAL || (!defined(RT_ARCH_AMD64) && !defined(RT_ARCH_X86))
7477DECLASM(int32_t) ASMBitFirstClear(const volatile void RT_FAR *pvBitmap, uint32_t cBits) RT_NOTHROW_PROTO;
7478#else
7479DECLINLINE(int32_t) ASMBitFirstClear(const volatile void RT_FAR *pvBitmap, uint32_t cBits) RT_NOTHROW_DEF
7480{
7481 if (cBits)
7482 {
7483 int32_t iBit;
7484# if RT_INLINE_ASM_GNU_STYLE
7485 RTCCUINTREG uEAX, uECX, uEDI;
7486 cBits = RT_ALIGN_32(cBits, 32);
    /*
     * EAX and EDX start out as 0xffffffff, ECX as the dword count and EDI as
     * the bitmap address.  REPE SCASL advances while the dwords equal EAX
     * (i.e. contain no clear bit).  If it runs off the end, the jump leaves
     * EDX at 0xffffffff, which is returned as -1.  Otherwise EDI is stepped
     * back to the differing dword, XOR turns its clear bits into set bits in
     * EAX, and the result is (byte offset from pvBitmap) * 8 + BSF(EAX).
     */
7487 __asm__ __volatile__("repe; scasl\n\t"
7488 "je 1f\n\t"
7489# ifdef RT_ARCH_AMD64
7490 "lea -4(%%rdi), %%rdi\n\t"
7491 "xorl (%%rdi), %%eax\n\t"
7492 "subq %5, %%rdi\n\t"
7493# else
7494 "lea -4(%%edi), %%edi\n\t"
7495 "xorl (%%edi), %%eax\n\t"
7496 "subl %5, %%edi\n\t"
7497# endif
7498 "shll $3, %%edi\n\t"
7499 "bsfl %%eax, %%edx\n\t"
7500 "addl %%edi, %%edx\n\t"
7501 "1:\t\n"
7502 : "=d" (iBit)
7503 , "=&c" (uECX)
7504 , "=&D" (uEDI)
7505 , "=&a" (uEAX)
7506 : "0" (0xffffffff)
7507 , "mr" (pvBitmap)
7508 , "1" (cBits >> 5)
7509 , "2" (pvBitmap)
7510 , "3" (0xffffffff)
7511 : "cc");
7512# else
7513 cBits = RT_ALIGN_32(cBits, 32);
    /* Same algorithm in MSC-style inline assembly; RBX/EBX keeps the bitmap start address. */
7514 __asm
7515 {
7516# ifdef RT_ARCH_AMD64
7517 mov rdi, [pvBitmap]
7518 mov rbx, rdi
7519# else
7520 mov edi, [pvBitmap]
7521 mov ebx, edi
7522# endif
7523 mov edx, 0ffffffffh
7524 mov eax, edx
7525 mov ecx, [cBits]
7526 shr ecx, 5
7527 repe scasd
7528 je done
7529
7530# ifdef RT_ARCH_AMD64
7531 lea rdi, [rdi - 4]
7532 xor eax, [rdi]
7533 sub rdi, rbx
7534# else
7535 lea edi, [edi - 4]
7536 xor eax, [edi]
7537 sub edi, ebx
7538# endif
7539 shl edi, 3
7540 bsf edx, eax
7541 add edx, edi
7542 done:
7543 mov [iBit], edx
7544 }
7545# endif
7546 return iBit;
7547 }
    /* An empty bitmap has no clear bit. */
7548 return -1;
7549}
7550#endif
7551
7552
7553/**
7554 * Finds the next clear bit in a bitmap.
7555 *
7556 * @returns Index of the next clear bit, i.e. the first one after iBitPrev.
7557 * @returns -1 if no clear bit was found.
7558 * @param pvBitmap Pointer to the bitmap (little endian).
7559 * @param cBits The number of bits in the bitmap. Multiple of 32.
7560 * @param iBitPrev The bit returned from the last search.
7561 * The search will start at iBitPrev + 1.
7562 */
7563#if RT_INLINE_ASM_EXTERNAL || (!defined(RT_ARCH_AMD64) && !defined(RT_ARCH_X86))
7564DECLASM(int) ASMBitNextClear(const volatile void RT_FAR *pvBitmap, uint32_t cBits, uint32_t iBitPrev) RT_NOTHROW_PROTO;
7565#else
7566DECLINLINE(int) ASMBitNextClear(const volatile void RT_FAR *pvBitmap, uint32_t cBits, uint32_t iBitPrev) RT_NOTHROW_DEF
7567{
7568 const volatile uint32_t RT_FAR *pau32Bitmap = (const volatile uint32_t RT_FAR *)pvBitmap;
    /* From here on iBitPrev is the first bit to examine; iBit is its position within its dword. */
7569 int iBit = ++iBitPrev & 31;
7570 if (iBit)
7571 {
7572 /*
7573 * Inspect the 32-bit word containing the unaligned bit.
7574 */
    /* Inverting makes clear bits show up as set bits; the shift drops the bits below the
       starting position so a scan result is relative to iBitPrev. */
7575 uint32_t u32 = ~pau32Bitmap[iBitPrev / 32] >> iBit;
7576
7577# if RT_INLINE_ASM_USES_INTRIN
7578 unsigned long ulBit = 0;
7579 if (_BitScanForward(&ulBit, u32))
7580 return ulBit + iBitPrev;
7581# else
7582# if RT_INLINE_ASM_GNU_STYLE
    /* BSF sets ZF when the source is zero; in that case iBit is forced to -1. */
7583 __asm__ __volatile__("bsf %1, %0\n\t"
7584 "jnz 1f\n\t"
7585 "movl $-1, %0\n\t" /** @todo use conditional move for 64-bit? */
7586 "1:\n\t"
7587 : "=r" (iBit)
7588 : "r" (u32)
7589 : "cc");
7590# else
7591 __asm
7592 {
7593 mov edx, [u32]
7594 bsf eax, edx
7595 jnz done
7596 mov eax, 0ffffffffh
7597 done:
7598 mov [iBit], eax
7599 }
7600# endif
7601 if (iBit >= 0)
7602 return iBit + (int)iBitPrev;
7603# endif
7604
7605 /*
7606 * Skip ahead and see if there is anything left to search.
7607 */
    /* Round iBitPrev up to the first bit of the following dword. */
7608 iBitPrev |= 31;
7609 iBitPrev++;
7610 if (cBits <= (uint32_t)iBitPrev)
7611 return -1;
7612 }
7613
7614 /*
7615 * 32-bit aligned search, let ASMBitFirstClear do the dirty work.
7616 */
    /* The helper's result is relative to the dword it was given, hence the rebasing below. */
7617 iBit = ASMBitFirstClear(&pau32Bitmap[iBitPrev / 32], cBits - iBitPrev);
7618 if (iBit >= 0)
7619 iBit += iBitPrev;
7620 return iBit;
7621}
7622#endif
7623
7624
7625/**
7626 * Finds the first set bit in a bitmap.
7627 *
 * The inline variants scan the bitmap one 32-bit word at a time; cBits is
 * rounded up to a multiple of 32 before the scan.
 *
7628 * @returns Index of the first set bit.
7629 * @returns -1 if no set bit was found.
7630 * @param pvBitmap Pointer to the bitmap (little endian).
7631 * @param cBits The number of bits in the bitmap. Multiple of 32.
7632 */
7633#if RT_INLINE_ASM_EXTERNAL || (!defined(RT_ARCH_AMD64) && !defined(RT_ARCH_X86))
7634DECLASM(int32_t) ASMBitFirstSet(const volatile void RT_FAR *pvBitmap, uint32_t cBits) RT_NOTHROW_PROTO;
7635#else
7636DECLINLINE(int32_t) ASMBitFirstSet(const volatile void RT_FAR *pvBitmap, uint32_t cBits) RT_NOTHROW_DEF
7637{
7638 if (cBits)
7639 {
7640 int32_t iBit;
7641# if RT_INLINE_ASM_GNU_STYLE
7642 RTCCUINTREG uEAX, uECX, uEDI;
7643 cBits = RT_ALIGN_32(cBits, 32);
    /*
     * EAX starts out as 0, EDX as 0xffffffff, ECX as the dword count and EDI
     * as the bitmap address.  REPE SCASL advances while the dwords equal EAX
     * (i.e. are all zero).  If it runs off the end, the jump leaves EDX at
     * 0xffffffff, which is returned as -1.  Otherwise EDI is stepped back to
     * the non-zero dword, which is loaded into EAX, and the result is
     * (byte offset from pvBitmap) * 8 + BSF(EAX).
     */
7644 __asm__ __volatile__("repe; scasl\n\t"
7645 "je 1f\n\t"
7646# ifdef RT_ARCH_AMD64
7647 "lea -4(%%rdi), %%rdi\n\t"
7648 "movl (%%rdi), %%eax\n\t"
7649 "subq %5, %%rdi\n\t"
7650# else
7651 "lea -4(%%edi), %%edi\n\t"
7652 "movl (%%edi), %%eax\n\t"
7653 "subl %5, %%edi\n\t"
7654# endif
7655 "shll $3, %%edi\n\t"
7656 "bsfl %%eax, %%edx\n\t"
7657 "addl %%edi, %%edx\n\t"
7658 "1:\t\n"
7659 : "=d" (iBit)
7660 , "=&c" (uECX)
7661 , "=&D" (uEDI)
7662 , "=&a" (uEAX)
7663 : "0" (0xffffffff)
7664 , "mr" (pvBitmap)
7665 , "1" (cBits >> 5)
7666 , "2" (pvBitmap)
7667 , "3" (0)
7668 : "cc");
7669# else
7670 cBits = RT_ALIGN_32(cBits, 32);
    /* Same algorithm in MSC-style inline assembly; RBX/EBX keeps the bitmap start address. */
7671 __asm
7672 {
7673# ifdef RT_ARCH_AMD64
7674 mov rdi, [pvBitmap]
7675 mov rbx, rdi
7676# else
7677 mov edi, [pvBitmap]
7678 mov ebx, edi
7679# endif
7680 mov edx, 0ffffffffh
7681 xor eax, eax
7682 mov ecx, [cBits]
7683 shr ecx, 5
7684 repe scasd
7685 je done
7686# ifdef RT_ARCH_AMD64
7687 lea rdi, [rdi - 4]
7688 mov eax, [rdi]
7689 sub rdi, rbx
7690# else
7691 lea edi, [edi - 4]
7692 mov eax, [edi]
7693 sub edi, ebx
7694# endif
7695 shl edi, 3
7696 bsf edx, eax
7697 add edx, edi
7698 done:
7699 mov [iBit], edx
7700 }
7701# endif
7702 return iBit;
7703 }
    /* An empty bitmap has no set bit. */
7704 return -1;
7705}
7706#endif
7707
7708
7709/**
7710 * Finds the next set bit in a bitmap.
7711 *
7712 * @returns Index of the next set bit.
7713 * @returns -1 if no set bit was found.
7714 * @param pvBitmap Pointer to the bitmap (little endian).
7715 * @param cBits The number of bits in the bitmap. Multiple of 32.
7716 * @param iBitPrev The bit returned from the last search.
7717 * The search will start at iBitPrev + 1.
7718 */
7719#if RT_INLINE_ASM_EXTERNAL || (!defined(RT_ARCH_AMD64) && !defined(RT_ARCH_X86))
7720DECLASM(int) ASMBitNextSet(const volatile void RT_FAR *pvBitmap, uint32_t cBits, uint32_t iBitPrev) RT_NOTHROW_PROTO;
7721#else
7722DECLINLINE(int) ASMBitNextSet(const volatile void RT_FAR *pvBitmap, uint32_t cBits, uint32_t iBitPrev) RT_NOTHROW_DEF
7723{
7724 const volatile uint32_t RT_FAR *pau32Bitmap = (const volatile uint32_t RT_FAR *)pvBitmap;
    /* From here on iBitPrev is the first bit to examine; iBit is its position within its dword. */
7725 int iBit = ++iBitPrev & 31;
7726 if (iBit)
7727 {
7728 /*
7729 * Inspect the 32-bit word containing the unaligned bit.
7730 */
    /* The shift drops the bits below the starting position so a scan result is relative
       to iBitPrev. */
7731 uint32_t u32 = pau32Bitmap[iBitPrev / 32] >> iBit;
7732
7733# if RT_INLINE_ASM_USES_INTRIN
7734 unsigned long ulBit = 0;
7735 if (_BitScanForward(&ulBit, u32))
7736 return ulBit + iBitPrev;
7737# else
7738# if RT_INLINE_ASM_GNU_STYLE
    /* BSF sets ZF when the source is zero; in that case iBit is forced to -1. */
7739 __asm__ __volatile__("bsf %1, %0\n\t"
7740 "jnz 1f\n\t" /** @todo use conditional move for 64-bit? */
7741 "movl $-1, %0\n\t"
7742 "1:\n\t"
7743 : "=r" (iBit)
7744 : "r" (u32)
7745 : "cc");
7746# else
7747 __asm
7748 {
7749 mov edx, [u32]
7750 bsf eax, edx
7751 jnz done
7752 mov eax, 0ffffffffh
7753 done:
7754 mov [iBit], eax
7755 }
7756# endif
7757 if (iBit >= 0)
7758 return iBit + (int)iBitPrev;
7759# endif
7760
7761 /*
7762 * Skip ahead and see if there is anything left to search.
7763 */
    /* Round iBitPrev up to the first bit of the following dword. */
7764 iBitPrev |= 31;
7765 iBitPrev++;
7766 if (cBits <= (uint32_t)iBitPrev)
7767 return -1;
7768 }
7769
7770 /*
7771 * 32-bit aligned search, let ASMBitFirstSet do the dirty work.
7772 */
    /* The helper's result is relative to the dword it was given, hence the rebasing below. */
7773 iBit = ASMBitFirstSet(&pau32Bitmap[iBitPrev / 32], cBits - iBitPrev);
7774 if (iBit >= 0)
7775 iBit += iBitPrev;
7776 return iBit;
7777}
7778#endif
7779
7780/** @} */
7781
7782
7783/** @defgroup grp_inline_bits Bitmap Operations
7784 * @{
7785 */
7786
7787/**
7788 * Finds the first bit which is set in the given 32-bit integer.
7789 * Bits are numbered from 1 (least significant) to 32.
7790 *
7791 * @returns index [1..32] of the first set bit.
7792 * @returns 0 if all bits are cleared.
7793 * @param u32 Integer to search for set bits.
7794 * @remarks Similar to ffs() in BSD.
7795 */
7796#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
7797RT_ASM_DECL_PRAGMA_WATCOM_386(unsigned) ASMBitFirstSetU32(uint32_t u32) RT_NOTHROW_PROTO;
7798#else
7799DECLINLINE(unsigned) ASMBitFirstSetU32(uint32_t u32) RT_NOTHROW_DEF
7800{
7801# if RT_INLINE_ASM_USES_INTRIN
    /* The zero-based index from the intrinsic is turned into a one-based one; 0 means "none". */
7802 unsigned long iBit;
7803 if (_BitScanForward(&iBit, u32))
7804 iBit++;
7805 else
7806 iBit = 0;
7807
7808# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
7809# if RT_INLINE_ASM_GNU_STYLE
    /* BSF sets ZF for a zero source: the result is then forced to 0, otherwise the
       zero-based index is incremented to make it one-based. */
7810 uint32_t iBit;
7811 __asm__ __volatile__("bsf %1, %0\n\t"
7812 "jnz 1f\n\t"
7813 "xorl %0, %0\n\t"
7814 "jmp 2f\n"
7815 "1:\n\t"
7816 "incl %0\n"
7817 "2:\n\t"
7818 : "=r" (iBit)
7819 : "rm" (u32)
7820 : "cc");
7821# else
    /* Same BSF / zero-or-increment sequence in MSC-style inline assembly. */
7822 uint32_t iBit;
7823 _asm
7824 {
7825 bsf eax, [u32]
7826 jnz found
7827 xor eax, eax
7828 jmp done
7829 found:
7830 inc eax
7831 done:
7832 mov [iBit], eax
7833 }
7834# endif
7835
7836# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
7837 /*
7838 * Using the "count leading zeros (clz)" instruction here because there
7839 * is no dedicated instruction to get the first set bit.
7840 * Need to reverse the bits in the value with "rbit" first because
7841 * "clz" starts counting from the most significant bit.
7842 */
7843 uint32_t iBit;
7844 __asm__ __volatile__(
7845# if defined(RT_ARCH_ARM64)
7846 "rbit %w[uVal], %w[uVal]\n\t"
7847 "clz %w[iBit], %w[uVal]\n\t"
7848# else
7849 "rbit %[uVal], %[uVal]\n\t"
7850 "clz %[iBit], %[uVal]\n\t"
7851# endif
7852 : [uVal] "=r" (u32)
7853 , [iBit] "=r" (iBit)
7854 : "[uVal]" (u32));
    /* A count of 32 means the input was zero; anything else is a zero-based index. */
7855 if (iBit != 32)
7856 iBit++;
7857 else
7858 iBit = 0; /* No bit set. */
7859
7860# else
7861# error "Port me"
7862# endif
7863 return iBit;
7864}
7865#endif
7866
7867
7868/**
7869 * Finds the first bit which is set in the given 32-bit integer.
7870 * Bits are numbered from 1 (least significant) to 32.
7871 *
7872 * @returns index [1..32] of the first set bit.
7873 * @returns 0 if all bits are cleared.
7874 * @param i32 Integer to search for set bits.
7875 * @remark Similar to ffs() in BSD.
7876 */
7877DECLINLINE(unsigned) ASMBitFirstSetS32(int32_t i32) RT_NOTHROW_DEF
7878{
7879 return ASMBitFirstSetU32((uint32_t)i32);
7880}
7881
7882
7883/**
7884 * Finds the first bit which is set in the given 64-bit integer.
7885 *
7886 * Bits are numbered from 1 (least significant) to 64.
7887 *
7888 * @returns index [1..64] of the first set bit.
7889 * @returns 0 if all bits are cleared.
7890 * @param u64 Integer to search for set bits.
7891 * @remarks Similar to ffs() in BSD.
7892 */
7893#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
7894RT_ASM_DECL_PRAGMA_WATCOM_386(unsigned) ASMBitFirstSetU64(uint64_t u64) RT_NOTHROW_PROTO;
7895#else
7896DECLINLINE(unsigned) ASMBitFirstSetU64(uint64_t u64) RT_NOTHROW_DEF
7897{
7898# if RT_INLINE_ASM_USES_INTRIN
7899 unsigned long iBit;
7900# if ARCH_BITS == 64
7901 if (_BitScanForward64(&iBit, u64))
7902 iBit++;
7903 else
7904 iBit = 0;
7905# else
    /* No 64-bit scan here: try the low half first, then the high half.  The +33 is the
       32-bit offset of the high half plus 1 for the one-based numbering. */
7906 if (_BitScanForward(&iBit, (uint32_t)u64))
7907 iBit++;
7908 else if (_BitScanForward(&iBit, (uint32_t)(u64 >> 32)))
7909 iBit += 33;
7910 else
7911 iBit = 0;
7912# endif
7913
7914# elif RT_INLINE_ASM_GNU_STYLE && defined(RT_ARCH_AMD64)
    /* BSFQ sets ZF for a zero source: the result is then forced to 0, otherwise the
       zero-based index is incremented to make it one-based. */
7915 uint64_t iBit;
7916 __asm__ __volatile__("bsfq %1, %0\n\t"
7917 "jnz 1f\n\t"
7918 "xorl %k0, %k0\n\t"
7919 "jmp 2f\n"
7920 "1:\n\t"
7921 "incl %k0\n"
7922 "2:\n\t"
7923 : "=r" (iBit)
7924 : "rm" (u64)
7925 : "cc");
7926
7927# elif defined(RT_ARCH_ARM64)
    /* Bit-reverse, then count leading zeros; a count of 64 means the input was zero. */
7928 uint64_t iBit;
7929 __asm__ __volatile__("rbit %[uVal], %[uVal]\n\t"
7930 "clz %[iBit], %[uVal]\n\t"
7931 : [uVal] "=r" (u64)
7932 , [iBit] "=r" (iBit)
7933 : "[uVal]" (u64));
7934 if (iBit != 64)
7935 iBit++;
7936 else
7937 iBit = 0; /* No bit set. */
7938
7939# else
    /* Generic path: scan the low half, then the high half (offset by 32) if the low one is zero. */
7940 unsigned iBit = ASMBitFirstSetU32((uint32_t)u64);
7941 if (!iBit)
7942 {
7943 iBit = ASMBitFirstSetU32((uint32_t)(u64 >> 32));
7944 if (iBit)
7945 iBit += 32;
7946 }
7947# endif
7948 return (unsigned)iBit;
7949}
7950#endif
7951
7952
7953/**
7954 * Finds the first bit which is set in the given 16-bit integer.
7955 *
7956 * Bits are numbered from 1 (least significant) to 16.
7957 *
7958 * @returns index [1..16] of the first set bit.
7959 * @returns 0 if all bits are cleared.
7960 * @param u16 Integer to search for set bits.
7961 * @remarks For 16-bit bs3kit code.
7962 */
7963#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
7964RT_ASM_DECL_PRAGMA_WATCOM_386(unsigned) ASMBitFirstSetU16(uint16_t u16) RT_NOTHROW_PROTO;
7965#else
7966DECLINLINE(unsigned) ASMBitFirstSetU16(uint16_t u16) RT_NOTHROW_DEF
7967{
7968 return ASMBitFirstSetU32((uint32_t)u16);
7969}
7970#endif
7971
7972
7973/**
7974 * Finds the last bit which is set in the given 32-bit integer.
7975 * Bits are numbered from 1 (least significant) to 32.
7976 *
7977 * @returns index [1..32] of the last set bit.
7978 * @returns 0 if all bits are cleared.
7979 * @param u32 Integer to search for set bits.
7980 * @remark Similar to fls() in BSD.
7981 */
7982#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
7983RT_ASM_DECL_PRAGMA_WATCOM_386(unsigned) ASMBitLastSetU32(uint32_t u32) RT_NOTHROW_PROTO;
7984#else
7985DECLINLINE(unsigned) ASMBitLastSetU32(uint32_t u32) RT_NOTHROW_DEF
7986{
7987# if RT_INLINE_ASM_USES_INTRIN
    /* The zero-based index from the intrinsic is turned into a one-based one; 0 means "none". */
7988 unsigned long iBit;
7989 if (_BitScanReverse(&iBit, u32))
7990 iBit++;
7991 else
7992 iBit = 0;
7993
7994# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
7995# if RT_INLINE_ASM_GNU_STYLE
    /* BSR sets ZF for a zero source: the result is then forced to 0, otherwise the
       zero-based index is incremented to make it one-based. */
7996 uint32_t iBit;
7997 __asm__ __volatile__("bsrl %1, %0\n\t"
7998 "jnz 1f\n\t"
7999 "xorl %0, %0\n\t"
8000 "jmp 2f\n"
8001 "1:\n\t"
8002 "incl %0\n"
8003 "2:\n\t"
8004 : "=r" (iBit)
8005 : "rm" (u32)
8006 : "cc");
8007# else
    /* Same BSR / zero-or-increment sequence in MSC-style inline assembly. */
8008 uint32_t iBit;
8009 _asm
8010 {
8011 bsr eax, [u32]
8012 jnz found
8013 xor eax, eax
8014 jmp done
8015 found:
8016 inc eax
8017 done:
8018 mov [iBit], eax
8019 }
8020# endif
8021
8022# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
    /* One-based index of the top set bit = 32 - (number of leading zeros).  For a zero input
       this presumably relies on CLZ yielding 32, which gives the documented 0. */
8023 uint32_t iBit;
8024 __asm__ __volatile__(
8025# if defined(RT_ARCH_ARM64)
8026 "clz %w[iBit], %w[uVal]\n\t"
8027# else
8028 "clz %[iBit], %[uVal]\n\t"
8029# endif
8030 : [iBit] "=r" (iBit)
8031 : [uVal] "r" (u32));
8032 iBit = 32 - iBit;
8033
8034# else
8035# error "Port me"
8036# endif
8037 return iBit;
8038}
8039#endif
8040
8041
8042/**
8043 * Finds the last bit which is set in the given 32-bit integer.
8044 * Bits are numbered from 1 (least significant) to 32.
8045 *
8046 * @returns index [1..32] of the last set bit.
8047 * @returns 0 if all bits are cleared.
8048 * @param i32 Integer to search for set bits.
8049 * @remark Similar to fls() in BSD.
8050 */
8051DECLINLINE(unsigned) ASMBitLastSetS32(int32_t i32) RT_NOTHROW_DEF
8052{
8053 return ASMBitLastSetU32((uint32_t)i32);
8054}
8055
8056
8057/**
8058 * Finds the last bit which is set in the given 64-bit integer.
8059 *
8060 * Bits are numbered from 1 (least significant) to 64.
8061 *
8062 * @returns index [1..64] of the last set bit.
8063 * @returns 0 if all bits are cleared.
8064 * @param u64 Integer to search for set bits.
8065 * @remark Similar to fls() in BSD.
8066 */
8067#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
8068RT_ASM_DECL_PRAGMA_WATCOM_386(unsigned) ASMBitLastSetU64(uint64_t u64) RT_NOTHROW_PROTO;
8069#else
8070DECLINLINE(unsigned) ASMBitLastSetU64(uint64_t u64) RT_NOTHROW_DEF
8071{
8072# if RT_INLINE_ASM_USES_INTRIN
8073 unsigned long iBit;
8074# if ARCH_BITS == 64
8075 if (_BitScanReverse64(&iBit, u64))
8076 iBit++;
8077 else
8078 iBit = 0;
8079# else
    /* No 64-bit scan here: try the high half first, then the low half.  The +33 is the
       32-bit offset of the high half plus 1 for the one-based numbering. */
8080 if (_BitScanReverse(&iBit, (uint32_t)(u64 >> 32)))
8081 iBit += 33;
8082 else if (_BitScanReverse(&iBit, (uint32_t)u64))
8083 iBit++;
8084 else
8085 iBit = 0;
8086# endif
8087
8088# elif RT_INLINE_ASM_GNU_STYLE && defined(RT_ARCH_AMD64)
    /* BSRQ sets ZF for a zero source: the result is then forced to 0, otherwise the
       zero-based index is incremented to make it one-based. */
8089 uint64_t iBit;
8090 __asm__ __volatile__("bsrq %1, %0\n\t"
8091 "jnz 1f\n\t"
8092 "xorl %k0, %k0\n\t"
8093 "jmp 2f\n"
8094 "1:\n\t"
8095 "incl %k0\n"
8096 "2:\n\t"
8097 : "=r" (iBit)
8098 : "rm" (u64)
8099 : "cc");
8100
8101# elif defined(RT_ARCH_ARM64)
    /* One-based index of the top set bit = 64 - (number of leading zeros).  For a zero input
       this presumably relies on CLZ yielding 64, which gives the documented 0. */
8102 uint64_t iBit;
8103 __asm__ __volatile__("clz %[iBit], %[uVal]\n\t"
8104 : [iBit] "=r" (iBit)
8105 : [uVal] "r" (u64));
8106 iBit = 64 - iBit;
8107
8108# else
    /* Generic path: scan the high half (offset by 32) first, then the low half if it is zero. */
8109 unsigned iBit = ASMBitLastSetU32((uint32_t)(u64 >> 32));
8110 if (iBit)
8111 iBit += 32;
8112 else
8113 iBit = ASMBitLastSetU32((uint32_t)u64);
8114# endif
8115 return (unsigned)iBit;
8116}
8117#endif
8118
8119
8120/**
8121 * Finds the last bit which is set in the given 16-bit integer.
8122 *
8123 * Bits are numbered from 1 (least significant) to 16.
8124 *
8125 * @returns index [1..16] of the last set bit.
8126 * @returns 0 if all bits are cleared.
8127 * @param u16 Integer to search for set bits.
8128 * @remarks For 16-bit bs3kit code.
8129 */
8130#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
8131RT_ASM_DECL_PRAGMA_WATCOM_386(unsigned) ASMBitLastSetU16(uint16_t u16) RT_NOTHROW_PROTO;
8132#else
8133DECLINLINE(unsigned) ASMBitLastSetU16(uint16_t u16) RT_NOTHROW_DEF
8134{
8135 return ASMBitLastSetU32((uint32_t)u16);
8136}
8137#endif
8138
8139
8140/**
8141 * Count the number of leading zero bits in the given 32-bit integer.
8142 *
8143 * The counting starts with the most significant bit.
8144 *
8145 * @returns Number of most significant zero bits.
8146 * @returns 32 if all bits are cleared.
8147 * @param u32 Integer to consider.
8148 * @remarks Similar to __builtin_clz() in gcc, except defined zero input result.
8149 */
8150#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
8151RT_ASM_DECL_PRAGMA_WATCOM_386(unsigned) ASMCountLeadingZerosU32(uint32_t u32) RT_NOTHROW_PROTO;
8152#else
8153DECLINLINE(unsigned) ASMCountLeadingZerosU32(uint32_t u32) RT_NOTHROW_DEF
8154{
8155# if RT_INLINE_ASM_USES_INTRIN
    /* The intrinsic yields the zero-based index of the top set bit; zero input is handled first. */
8156 unsigned long iBit;
8157 if (!_BitScanReverse(&iBit, u32))
8158 return 32;
8159 return 31 - (unsigned)iBit;
8160
8161# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
    /* All variants leave the top set bit's index in iBit, or -1 (0xffffffff) for a zero input,
       so that the unsigned "31 - iBit" at the end yields 32 in that case. */
8162 uint32_t iBit;
8163# if RT_INLINE_ASM_GNU_STYLE && defined(RT_ARCH_AMD64) && 0 /* significantly slower on 10980xe; 929 vs 237 ps/call */
8164 __asm__ __volatile__("bsrl %1, %0\n\t"
8165 "cmovzl %2, %0\n\t"
8166 : "=&r" (iBit)
8167 : "rm" (u32)
8168 , "rm" ((int32_t)-1)
8169 : "cc");
8170# elif RT_INLINE_ASM_GNU_STYLE
8171 __asm__ __volatile__("bsr %1, %0\n\t"
8172 "jnz 1f\n\t"
8173 "mov $-1, %0\n\t"
8174 "1:\n\t"
8175 : "=r" (iBit)
8176 : "rm" (u32)
8177 : "cc");
8178# else
8179 _asm
8180 {
8181 bsr eax, [u32]
8182 jnz found
8183 mov eax, -1
8184 found:
8185 mov [iBit], eax
8186 }
8187# endif
8188 return 31 - iBit;
8189
8190# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
    /* The CLZ result is returned as is. */
8191 uint32_t iBit;
8192 __asm__ __volatile__(
8193# if defined(RT_ARCH_ARM64)
8194 "clz %w[iBit], %w[uVal]\n\t"
8195# else
8196 "clz %[iBit], %[uVal]\n\t"
8197# endif
8198 : [uVal] "=r" (u32)
8199 , [iBit] "=r" (iBit)
8200 : "[uVal]" (u32));
8201 return iBit;
8202
8203# elif defined(__GNUC__)
    /* Zero is special-cased before calling the builtin so that the documented 32 is returned. */
8204 AssertCompile(sizeof(u32) == sizeof(unsigned int));
8205 return u32 ? __builtin_clz(u32) : 32;
8206
8207# else
8208# error "Port me"
8209# endif
8210}
8211#endif
8212
8213
8214/**
8215 * Count the number of leading zero bits in the given 64-bit integer.
8216 *
8217 * The counting starts with the most significate bit.
8218 *
8219 * @returns Number of most significant zero bits.
8220 * @returns 64 if all bits are cleared.
8221 * @param u64 Integer to consider.
8222 * @remarks Similar to __builtin_clzl() in gcc, except defined zero input
8223 * result.
8224 */
8225#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
8226RT_ASM_DECL_PRAGMA_WATCOM_386(unsigned) ASMCountLeadingZerosU64(uint64_t u64) RT_NOTHROW_PROTO;
8227#else
8228DECLINLINE(unsigned) ASMCountLeadingZerosU64(uint64_t u64) RT_NOTHROW_DEF
8229{
8230# if RT_INLINE_ASM_USES_INTRIN
8231 unsigned long iBit;
8232# if ARCH_BITS == 64
8233 if (_BitScanReverse64(&iBit, u64))
8234 return 63 - (unsigned)iBit;
8235# else
8236 if (_BitScanReverse(&iBit, (uint32_t)(u64 >> 32)))
8237 return 31 - (unsigned)iBit;
8238 if (_BitScanReverse(&iBit, (uint32_t)u64))
8239 return 63 - (unsigned)iBit;
8240# endif
8241 return 64;
8242
8243# elif RT_INLINE_ASM_GNU_STYLE && defined(RT_ARCH_AMD64)
8244 uint64_t iBit;
8245# if 0 /* 10980xe benchmark: 932 ps/call - the slower variant */
8246 __asm__ __volatile__("bsrq %1, %0\n\t"
8247 "cmovzq %2, %0\n\t"
8248 : "=&r" (iBit)
8249 : "rm" (u64)
8250 , "rm" ((int64_t)-1)
8251 : "cc");
8252# else /* 10980xe benchmark: 262 ps/call */
8253 __asm__ __volatile__("bsrq %1, %0\n\t"
8254 "jnz 1f\n\t"
8255 "mov $-1, %0\n\t"
8256 "1:\n\t"
8257 : "=&r" (iBit)
8258 : "rm" (u64)
8259 : "cc");
8260# endif
8261 return 63 - (unsigned)iBit;
8262
8263# elif defined(RT_ARCH_ARM64)
8264 uint64_t iBit;
8265 __asm__ __volatile__("clz %[iBit], %[uVal]\n\t"
8266 : [uVal] "=r" (u64)
8267 , [iBit] "=r" (iBit)
8268 : "[uVal]" (u64));
8269 return (unsigned)iBit;
8270
8271# elif defined(__GNUC__) && ARCH_BITS == 64
8272 AssertCompile(sizeof(u64) == sizeof(unsigned long));
8273 return u64 ? __builtin_clzl(u64) : 64;
8274
8275# else
8276 unsigned iBit = ASMCountLeadingZerosU32((uint32_t)(u64 >> 32));
8277 if (iBit == 32)
8278 iBit = ASMCountLeadingZerosU32((uint32_t)u64) + 32;
8279 return iBit;
8280# endif
8281}
8282#endif
8283
8284
8285/**
8286 * Count the number of leading zero bits in the given 16-bit integer.
8287 *
8288 * The counting starts with the most significate bit.
8289 *
8290 * @returns Number of most significant zero bits.
8291 * @returns 16 if all bits are cleared.
8292 * @param u16 Integer to consider.
8293 */
8294#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
8295RT_ASM_DECL_PRAGMA_WATCOM_386(unsigned) ASMCountLeadingZerosU16(uint16_t u16) RT_NOTHROW_PROTO;
8296#else
8297DECLINLINE(unsigned) ASMCountLeadingZerosU16(uint16_t u16) RT_NOTHROW_DEF
8298{
8299# if RT_INLINE_ASM_GNU_STYLE && (defined(RT_ARCH_X86) || defined(RT_ARCH_AMD64)) && 0 /* slower (10980xe: 987 vs 292 ps/call) */
8300 uint16_t iBit;
8301 __asm__ __volatile__("bsrw %1, %0\n\t"
8302 "jnz 1f\n\t"
8303 "mov $-1, %0\n\t"
8304 "1:\n\t"
8305 : "=r" (iBit)
8306 : "rm" (u16)
8307 : "cc");
8308 return 15 - (int16_t)iBit;
8309# else
8310 return ASMCountLeadingZerosU32((uint32_t)u16) - 16;
8311# endif
8312}
8313#endif
8314
8315
8316/**
8317 * Count the number of trailing zero bits in the given 32-bit integer.
8318 *
8319 * The counting starts with the least significate bit, i.e. the zero bit.
8320 *
8321 * @returns Number of least significant zero bits.
8322 * @returns 32 if all bits are cleared.
8323 * @param u32 Integer to consider.
8324 * @remarks Similar to __builtin_ctz() in gcc, except defined zero input result.
8325 */
8326#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
8327RT_ASM_DECL_PRAGMA_WATCOM_386(unsigned) ASMCountTrailingZerosU32(uint32_t u32) RT_NOTHROW_PROTO;
8328#else
8329DECLINLINE(unsigned) ASMCountTrailingZerosU32(uint32_t u32) RT_NOTHROW_DEF
8330{
8331# if RT_INLINE_ASM_USES_INTRIN
8332 unsigned long iBit;
8333 if (!_BitScanForward(&iBit, u32))
8334 return 32;
8335 return (unsigned)iBit;
8336
8337# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
8338 uint32_t iBit;
8339# if RT_INLINE_ASM_GNU_STYLE && defined(RT_ARCH_AMD64) && 0 /* significantly slower on 10980xe; 932 vs 240 ps/call */
8340 __asm__ __volatile__("bsfl %1, %0\n\t"
8341 "cmovzl %2, %0\n\t"
8342 : "=&r" (iBit)
8343 : "rm" (u32)
8344 , "rm" ((int32_t)32)
8345 : "cc");
8346# elif RT_INLINE_ASM_GNU_STYLE
8347 __asm__ __volatile__("bsfl %1, %0\n\t"
8348 "jnz 1f\n\t"
8349 "mov $32, %0\n\t"
8350 "1:\n\t"
8351 : "=r" (iBit)
8352 : "rm" (u32)
8353 : "cc");
8354# else
8355 _asm
8356 {
8357 bsf eax, [u32]
8358 jnz found
8359 mov eax, 32
8360 found:
8361 mov [iBit], eax
8362 }
8363# endif
8364 return iBit;
8365
8366# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
8367 /* Invert the bits and use clz. */
8368 uint32_t iBit;
8369 __asm__ __volatile__(
8370# if defined(RT_ARCH_ARM64)
8371 "rbit %w[uVal], %w[uVal]\n\t"
8372 "clz %w[iBit], %w[uVal]\n\t"
8373# else
8374 "rbit %[uVal], %[uVal]\n\t"
8375 "clz %[iBit], %[uVal]\n\t"
8376# endif
8377 : [uVal] "=r" (u32)
8378 , [iBit] "=r" (iBit)
8379 : "[uVal]" (u32));
8380 return iBit;
8381
8382# elif defined(__GNUC__)
8383 AssertCompile(sizeof(u32) == sizeof(unsigned int));
8384 return u32 ? __builtin_ctz(u32) : 32;
8385
8386# else
8387# error "Port me"
8388# endif
8389}
8390#endif
8391
8392
8393/**
8394 * Count the number of trailing zero bits in the given 64-bit integer.
8395 *
8396 * The counting starts with the least significate bit.
8397 *
8398 * @returns Number of least significant zero bits.
8399 * @returns 64 if all bits are cleared.
8400 * @param u64 Integer to consider.
8401 * @remarks Similar to __builtin_ctzl() in gcc, except defined zero input
8402 * result.
8403 */
8404#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
8405RT_ASM_DECL_PRAGMA_WATCOM_386(unsigned) ASMCountTrailingZerosU64(uint64_t u64) RT_NOTHROW_PROTO;
8406#else
8407DECLINLINE(unsigned) ASMCountTrailingZerosU64(uint64_t u64) RT_NOTHROW_DEF
8408{
8409# if RT_INLINE_ASM_USES_INTRIN
8410 unsigned long iBit;
8411# if ARCH_BITS == 64
8412 if (_BitScanForward64(&iBit, u64))
8413 return (unsigned)iBit;
8414# else
8415 if (_BitScanForward(&iBit, (uint32_t)u64))
8416 return (unsigned)iBit;
8417 if (_BitScanForward(&iBit, (uint32_t)(u64 >> 32)))
8418 return (unsigned)iBit + 32;
8419# endif
8420 return 64;
8421
8422# elif RT_INLINE_ASM_GNU_STYLE && defined(RT_ARCH_AMD64)
8423 uint64_t iBit;
8424# if 0 /* 10980xe benchmark: 932 ps/call - the slower variant */
8425 __asm__ __volatile__("bsfq %1, %0\n\t"
8426 "cmovzq %2, %0\n\t"
8427 : "=&r" (iBit)
8428 : "rm" (u64)
8429 , "rm" ((int64_t)64)
8430 : "cc");
8431# else /* 10980xe benchmark: 262 ps/call */
8432 __asm__ __volatile__("bsfq %1, %0\n\t"
8433 "jnz 1f\n\t"
8434 "mov $64, %0\n\t"
8435 "1:\n\t"
8436 : "=&r" (iBit)
8437 : "rm" (u64)
8438 : "cc");
8439# endif
8440 return (unsigned)iBit;
8441
8442# elif defined(RT_ARCH_ARM64)
8443 /* Invert the bits and use clz. */
8444 uint64_t iBit;
8445 __asm__ __volatile__("rbit %[uVal], %[uVal]\n\t"
8446 "clz %[iBit], %[uVal]\n\t"
8447 : [uVal] "=r" (u64)
8448 , [iBit] "=r" (iBit)
8449 : "[uVal]" (u64));
8450 return (unsigned)iBit;
8451
8452# elif defined(__GNUC__) && ARCH_BITS == 64
8453 AssertCompile(sizeof(u64) == sizeof(unsigned long));
8454 return u64 ? __builtin_ctzl(u64) : 64;
8455
8456# else
8457 unsigned iBit = ASMCountTrailingZerosU32((uint32_t)u64);
8458 if (iBit == 32)
8459 iBit = ASMCountTrailingZerosU32((uint32_t)(u64 >> 32)) + 32;
8460 return iBit;
8461# endif
8462}
8463#endif
8464
8465
8466/**
8467 * Count the number of trailing zero bits in the given 16-bit integer.
8468 *
8469 * The counting starts with the most significate bit.
8470 *
8471 * @returns Number of most significant zero bits.
8472 * @returns 16 if all bits are cleared.
8473 * @param u16 Integer to consider.
8474 */
8475#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
8476RT_ASM_DECL_PRAGMA_WATCOM_386(unsigned) ASMCountTrailingZerosU16(uint16_t u16) RT_NOTHROW_PROTO;
8477#else
8478DECLINLINE(unsigned) ASMCountTrailingZerosU16(uint16_t u16) RT_NOTHROW_DEF
8479{
8480# if RT_INLINE_ASM_GNU_STYLE && (defined(RT_ARCH_X86) || defined(RT_ARCH_AMD64)) && 0 /* slower (10980xe: 992 vs 349 ps/call) */
8481 uint16_t iBit;
8482 __asm__ __volatile__("bsfw %1, %0\n\t"
8483 "jnz 1f\n\t"
8484 "mov $16, %0\n\t"
8485 "1:\n\t"
8486 : "=r" (iBit)
8487 : "rm" (u16)
8488 : "cc");
8489 return iBit;
8490# else
8491 return ASMCountTrailingZerosU32((uint32_t)u16 | UINT32_C(0x10000));
8492#endif
8493}
8494#endif
8495
8496
8497/**
8498 * Rotate 32-bit unsigned value to the left by @a cShift.
8499 *
8500 * @returns Rotated value.
8501 * @param u32 The value to rotate.
8502 * @param cShift How many bits to rotate by.
8503 */
8504#ifdef __WATCOMC__
8505RT_ASM_DECL_PRAGMA_WATCOM(uint32_t) ASMRotateLeftU32(uint32_t u32, unsigned cShift) RT_NOTHROW_PROTO;
8506#else
8507DECLINLINE(uint32_t) ASMRotateLeftU32(uint32_t u32, uint32_t cShift) RT_NOTHROW_DEF
8508{
8509# if RT_INLINE_ASM_USES_INTRIN
8510 return _rotl(u32, cShift);
8511
8512# elif RT_INLINE_ASM_GNU_STYLE && (defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86))
8513 __asm__ __volatile__("roll %b1, %0" : "=g" (u32) : "Ic" (cShift), "0" (u32) : "cc");
8514 return u32;
8515
8516# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
8517 __asm__ __volatile__(
8518# if defined(RT_ARCH_ARM64)
8519 "ror %w[uRet], %w[uVal], %w[cShift]\n\t"
8520# else
8521 "ror %[uRet], %[uVal], %[cShift]\n\t"
8522# endif
8523 : [uRet] "=r" (u32)
8524 : [uVal] "[uRet]" (u32)
8525 , [cShift] "r" (32 - (cShift & 31))); /** @todo there is an immediate form here */
8526 return u32;
8527
8528# else
8529 cShift &= 31;
8530 return (u32 << cShift) | (u32 >> (32 - cShift));
8531# endif
8532}
8533#endif
8534
8535
8536/**
8537 * Rotate 32-bit unsigned value to the right by @a cShift.
8538 *
8539 * @returns Rotated value.
8540 * @param u32 The value to rotate.
8541 * @param cShift How many bits to rotate by.
8542 */
8543#ifdef __WATCOMC__
8544RT_ASM_DECL_PRAGMA_WATCOM(uint32_t) ASMRotateRightU32(uint32_t u32, unsigned cShift) RT_NOTHROW_PROTO;
8545#else
8546DECLINLINE(uint32_t) ASMRotateRightU32(uint32_t u32, uint32_t cShift) RT_NOTHROW_DEF
8547{
8548# if RT_INLINE_ASM_USES_INTRIN
8549 return _rotr(u32, cShift);
8550
8551# elif RT_INLINE_ASM_GNU_STYLE && (defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86))
8552 __asm__ __volatile__("rorl %b1, %0" : "=g" (u32) : "Ic" (cShift), "0" (u32) : "cc");
8553 return u32;
8554
8555# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
8556 __asm__ __volatile__(
8557# if defined(RT_ARCH_ARM64)
8558 "ror %w[uRet], %w[uVal], %w[cShift]\n\t"
8559# else
8560 "ror %[uRet], %[uVal], %[cShift]\n\t"
8561# endif
8562 : [uRet] "=r" (u32)
8563 : [uVal] "[uRet]" (u32)
8564 , [cShift] "r" (cShift & 31)); /** @todo there is an immediate form here */
8565 return u32;
8566
8567# else
8568 cShift &= 31;
8569 return (u32 >> cShift) | (u32 << (32 - cShift));
8570# endif
8571}
8572#endif
8573
8574
8575/**
8576 * Rotate 64-bit unsigned value to the left by @a cShift.
8577 *
8578 * @returns Rotated value.
8579 * @param u64 The value to rotate.
8580 * @param cShift How many bits to rotate by.
8581 */
8582DECLINLINE(uint64_t) ASMRotateLeftU64(uint64_t u64, uint32_t cShift) RT_NOTHROW_DEF
8583{
8584#if RT_INLINE_ASM_USES_INTRIN
8585 return _rotl64(u64, cShift);
8586
8587#elif RT_INLINE_ASM_GNU_STYLE && defined(RT_ARCH_AMD64)
8588 __asm__ __volatile__("rolq %b1, %0" : "=g" (u64) : "Jc" (cShift), "0" (u64) : "cc");
8589 return u64;
8590
8591#elif RT_INLINE_ASM_GNU_STYLE && defined(RT_ARCH_X86)
8592 uint32_t uSpill;
8593 __asm__ __volatile__("testb $0x20, %%cl\n\t" /* if (cShift >= 0x20) { swap(u64.hi, u64lo); cShift -= 0x20; } */
8594 "jz 1f\n\t"
8595 "xchgl %%eax, %%edx\n\t"
8596 "1:\n\t"
8597 "andb $0x1f, %%cl\n\t" /* if (cShift & 0x1f) { */
8598 "jz 2f\n\t"
8599 "movl %%edx, %2\n\t" /* save the hi value in %3. */
8600 "shldl %%cl,%%eax,%%edx\n\t" /* shift the hi value left, feeding MSBits from the low value. */
8601 "shldl %%cl,%2,%%eax\n\t" /* shift the lo value left, feeding MSBits from the saved hi value. */
8602 "2:\n\t" /* } */
8603 : "=A" (u64)
8604 , "=c" (cShift)
8605 , "=r" (uSpill)
8606 : "0" (u64)
8607 , "1" (cShift)
8608 : "cc");
8609 return u64;
8610
8611# elif defined(RT_ARCH_ARM64)
8612 __asm__ __volatile__("ror %[uRet], %[uVal], %[cShift]\n\t"
8613 : [uRet] "=r" (u64)
8614 : [uVal] "[uRet]" (u64)
8615 , [cShift] "r" ((uint64_t)(64 - (cShift & 63)))); /** @todo there is an immediate form here */
8616 return u64;
8617
8618#else
8619 cShift &= 63;
8620 return (u64 << cShift) | (u64 >> (64 - cShift));
8621#endif
8622}
8623
8624
8625/**
8626 * Rotate 64-bit unsigned value to the right by @a cShift.
8627 *
8628 * @returns Rotated value.
8629 * @param u64 The value to rotate.
8630 * @param cShift How many bits to rotate by.
8631 */
8632DECLINLINE(uint64_t) ASMRotateRightU64(uint64_t u64, uint32_t cShift) RT_NOTHROW_DEF
8633{
8634#if RT_INLINE_ASM_USES_INTRIN
8635 return _rotr64(u64, cShift);
8636
8637#elif RT_INLINE_ASM_GNU_STYLE && defined(RT_ARCH_AMD64)
8638 __asm__ __volatile__("rorq %b1, %0" : "=g" (u64) : "Jc" (cShift), "0" (u64) : "cc");
8639 return u64;
8640
8641#elif RT_INLINE_ASM_GNU_STYLE && defined(RT_ARCH_X86)
8642 uint32_t uSpill;
8643 __asm__ __volatile__("testb $0x20, %%cl\n\t" /* if (cShift >= 0x20) { swap(u64.hi, u64lo); cShift -= 0x20; } */
8644 "jz 1f\n\t"
8645 "xchgl %%eax, %%edx\n\t"
8646 "1:\n\t"
8647 "andb $0x1f, %%cl\n\t" /* if (cShift & 0x1f) { */
8648 "jz 2f\n\t"
8649 "movl %%edx, %2\n\t" /* save the hi value in %3. */
8650 "shrdl %%cl,%%eax,%%edx\n\t" /* shift the hi value right, feeding LSBits from the low value. */
8651 "shrdl %%cl,%2,%%eax\n\t" /* shift the lo value right, feeding LSBits from the saved hi value. */
8652 "2:\n\t" /* } */
8653 : "=A" (u64)
8654 , "=c" (cShift)
8655 , "=r" (uSpill)
8656 : "0" (u64)
8657 , "1" (cShift)
8658 : "cc");
8659 return u64;
8660
8661# elif defined(RT_ARCH_ARM64)
8662 __asm__ __volatile__("ror %[uRet], %[uVal], %[cShift]\n\t"
8663 : [uRet] "=r" (u64)
8664 : [uVal] "[uRet]" (u64)
8665 , [cShift] "r" ((uint64_t)(cShift & 63))); /** @todo there is an immediate form here */
8666 return u64;
8667
8668#else
8669 cShift &= 63;
8670 return (u64 >> cShift) | (u64 << (64 - cShift));
8671#endif
8672}
8673
8674/** @} */
8675
8676
8677/** @} */
8678
8679/*
8680 * Include #pragma aux definitions for Watcom C/C++.
8681 */
8682#if defined(__WATCOMC__) && ARCH_BITS == 16 && defined(RT_ARCH_X86)
8683# define IPRT_ASM_WATCOM_X86_16_WITH_PRAGMAS
8684# undef IPRT_INCLUDED_asm_watcom_x86_16_h
8685# include "asm-watcom-x86-16.h"
8686#elif defined(__WATCOMC__) && ARCH_BITS == 32 && defined(RT_ARCH_X86)
8687# define IPRT_ASM_WATCOM_X86_32_WITH_PRAGMAS
8688# undef IPRT_INCLUDED_asm_watcom_x86_32_h
8689# include "asm-watcom-x86-32.h"
8690#endif
8691
8692#endif /* !IPRT_INCLUDED_asm_h */
8693
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette