VirtualBox

source: vbox/trunk/include/iprt/asm.h@ 106944

Last change on this file since 106944 was 106944, checked in by vboxsync, 3 months ago

iprt/asm.h: Missing '#pragma intrinsic(_byteswap_uint64)' caused unresolved externals in ring-0 code. jiraref:VBP-1449

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 297.9 KB
Line 
1/** @file
2 * IPRT - Assembly Functions.
3 */
4
5/*
6 * Copyright (C) 2006-2024 Oracle and/or its affiliates.
7 *
8 * This file is part of VirtualBox base platform packages, as
9 * available from https://www.virtualbox.org.
10 *
11 * This program is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU General Public License
13 * as published by the Free Software Foundation, in version 3 of the
14 * License.
15 *
16 * This program is distributed in the hope that it will be useful, but
17 * WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, see <https://www.gnu.org/licenses>.
23 *
24 * The contents of this file may alternatively be used under the terms
25 * of the Common Development and Distribution License Version 1.0
26 * (CDDL), a copy of it is provided in the "COPYING.CDDL" file included
27 * in the VirtualBox distribution, in which case the provisions of the
28 * CDDL are applicable instead of those of the GPL.
29 *
30 * You may elect to license modified versions of this file under the
31 * terms and conditions of either the GPL or the CDDL or both.
32 *
33 * SPDX-License-Identifier: GPL-3.0-only OR CDDL-1.0
34 */
35
36#ifndef IPRT_INCLUDED_asm_h
37#define IPRT_INCLUDED_asm_h
38#ifndef RT_WITHOUT_PRAGMA_ONCE
39# pragma once
40#endif
41
42#include <iprt/cdefs.h>
43#include <iprt/types.h>
44#include <iprt/assert.h>
45/** @def RT_INLINE_ASM_USES_INTRIN
46 * Defined as 1 if we're using a _MSC_VER 1400.
47 * Otherwise defined as 0.
48 */
49
50/* Solaris 10 header ugliness */
51#ifdef u
52# undef u
53#endif
54
55#if defined(_MSC_VER) && RT_INLINE_ASM_USES_INTRIN
56/* Emit the intrinsics at all optimization levels. */
57# include <iprt/sanitized/intrin.h>
58# pragma intrinsic(_ReadWriteBarrier)
59# if defined(RT_ARCH_X86) || defined(RT_ARCH_AMD64)
60# pragma intrinsic(__cpuid)
61# pragma intrinsic(__stosd)
62# pragma intrinsic(__stosw)
63# pragma intrinsic(__stosb)
64# ifdef RT_ARCH_AMD64
65# pragma intrinsic(__stosq)
66# pragma intrinsic(_byteswap_uint64)
67# pragma intrinsic(_InterlockedCompareExchange128)
68# pragma intrinsic(_InterlockedExchange64)
69# pragma intrinsic(_InterlockedExchangeAdd64)
70# pragma intrinsic(_InterlockedAnd64)
71# pragma intrinsic(_InterlockedOr64)
72# pragma intrinsic(_InterlockedIncrement64)
73# pragma intrinsic(_InterlockedDecrement64)
74# endif
75# elif defined(RT_ARCH_ARM64)
76# pragma intrinsic(_byteswap_uint64)
77# pragma intrinsic(__break)
78# pragma intrinsic(__dmb)
79# pragma intrinsic(__dsb)
80# pragma intrinsic(__isb)
81# pragma intrinsic(__nop)
82# pragma intrinsic(__yield)
83# pragma intrinsic(__swp8)
84# pragma intrinsic(__swpa8)
85# pragma intrinsic(__swpal8)
86# pragma intrinsic(__swp16)
87# pragma intrinsic(__swpa16)
88# pragma intrinsic(__swpal16)
89# pragma intrinsic(__swp32)
90# pragma intrinsic(__swpa32)
91# pragma intrinsic(__swpal32)
92# pragma intrinsic(__swp64)
93# pragma intrinsic(__swpa64)
94# pragma intrinsic(__swpal64)
95# pragma intrinsic(__cas8)
96# pragma intrinsic(__casl8)
97# pragma intrinsic(__cas16)
98# pragma intrinsic(__casl16)
99# pragma intrinsic(__cas32)
100# pragma intrinsic(__casl32)
101# pragma intrinsic(__cas64)
102# pragma intrinsic(__casl64)
103# pragma intrinsic(__casa8)
104# pragma intrinsic(__casal8)
105# pragma intrinsic(__casa16)
106# pragma intrinsic(__casa64)
107# pragma intrinsic(__iso_volatile_load8)
108# pragma intrinsic(__iso_volatile_load16)
109# pragma intrinsic(__iso_volatile_load32)
110# pragma intrinsic(__iso_volatile_load64)
111# pragma intrinsic(__iso_volatile_store8)
112# pragma intrinsic(__iso_volatile_store16)
113# pragma intrinsic(__iso_volatile_store32)
114# pragma intrinsic(__iso_volatile_store64)
115# pragma intrinsic(__load_acquire8)
116# pragma intrinsic(__load_acquire16)
117# pragma intrinsic(__load_acquire32)
118# pragma intrinsic(__load_acquire64)
119# pragma intrinsic(__stlr8)
120# pragma intrinsic(__stlr16)
121# pragma intrinsic(__stlr32)
122# pragma intrinsic(__stlr64)
123# else
124# error "Port me"
125# endif
126# pragma intrinsic(_BitScanForward)
127# pragma intrinsic(_BitScanReverse)
128# pragma intrinsic(_bittest)
129# pragma intrinsic(_bittestandset)
130# pragma intrinsic(_bittestandreset)
131# pragma intrinsic(_bittestandcomplement)
132# pragma intrinsic(_byteswap_ushort)
133# pragma intrinsic(_byteswap_ulong)
134# pragma intrinsic(_interlockedbittestandset)
135# pragma intrinsic(_interlockedbittestandreset)
136# pragma intrinsic(_InterlockedAnd)
137# pragma intrinsic(_InterlockedOr)
138# pragma intrinsic(_InterlockedXor)
139# pragma intrinsic(_InterlockedIncrement)
140# pragma intrinsic(_InterlockedDecrement)
141# pragma intrinsic(_InterlockedExchange)
142# pragma intrinsic(_InterlockedExchangeAdd)
143# pragma intrinsic(_InterlockedCompareExchange)
144# pragma intrinsic(_InterlockedCompareExchange8)
145# pragma intrinsic(_InterlockedCompareExchange16)
146# pragma intrinsic(_InterlockedCompareExchange64)
147# pragma intrinsic(_rotl)
148# pragma intrinsic(_rotr)
149# pragma intrinsic(_rotl64)
150# pragma intrinsic(_rotr64)
151#endif
152
153#if (defined(RT_ARCH_ARM64) && (defined(RT_OS_DARWIN) || defined(RT_OS_WINDOWS))) || defined(DOXYGEN_RUNNING)
154/** @def RTASM_ARM64_USE_FEAT_LSE
155 * Use instructions from the FEAT_LSE set to implement atomic operations,
156 * assuming that the host CPU always supports these. */
157# define RTASM_ARM64_USE_FEAT_LSE 1
158/** @def RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB
159 * Set to use DMB w/o barrier in most places and rely on the acquire-release
160 * aspects to do the serializing. The assumption is that the tstRTInline
161 * benchmark may be skewing the results testing an unusual scenario. */
162# define RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB 1
163#endif
164
165
166/*
167 * Undefine all symbols we have Watcom C/C++ #pragma aux'es for.
168 */
169#if defined(__WATCOMC__) && ARCH_BITS == 16 && defined(RT_ARCH_X86)
170# include "asm-watcom-x86-16.h"
171#elif defined(__WATCOMC__) && ARCH_BITS == 32 && defined(RT_ARCH_X86)
172# include "asm-watcom-x86-32.h"
173#endif
174
175
176/** @defgroup grp_rt_asm ASM - Assembly Routines
177 * @ingroup grp_rt
178 *
179 * @remarks The difference between ordered and unordered atomic operations are
180 * that the former will complete outstanding reads and writes before
181 * continuing while the latter doesn't make any promises about the
182 * order. Ordered operations doesn't, it seems, make any 100% promise
183 * wrt to whether the operation will complete before any subsequent
184 * memory access. (please, correct if wrong.)
185 *
186 * ASMAtomicSomething operations are all ordered, while
187 * ASMAtomicUoSomething are unordered (note the Uo).
188 *
189 * Please note that ordered operations does not necessarily imply a
190 * compiler (memory) barrier. The user has to use the
191 * ASMCompilerBarrier() macro when that is deemed necessary.
192 *
193 * @remarks Some remarks about __volatile__: Without this keyword gcc is allowed
194 * to reorder or even optimize assembler instructions away. For
195 * instance, in the following code the second rdmsr instruction is
196 * optimized away because gcc treats that instruction as deterministic:
197 *
198 * @code
199 * static inline uint64_t rdmsr_low(int idx)
200 * {
201 * uint32_t low;
202 * __asm__ ("rdmsr" : "=a"(low) : "c"(idx) : "edx");
203 * }
204 * ...
205 * uint32_t msr1 = rdmsr_low(1);
206 * foo(msr1);
207 * msr1 = rdmsr_low(1);
208 * bar(msr1);
209 * @endcode
210 *
211 * The input parameter of rdmsr_low is the same for both calls and
212 * therefore gcc will use the result of the first call as input
213 * parameter for bar() as well. For rdmsr this is not acceptable as
214 * this instruction is _not_ deterministic. This applies to reading
215 * machine status information in general.
216 *
217 * @{
218 */
219
220
221/** @def RT_INLINE_ASM_GCC_4_3_X_X86
222 * Used to work around some 4.3.x register allocation issues in this version of
223 * the compiler. So far this workaround is still required for 4.4 and 4.5 but
224 * definitely not for 5.x */
225#if (RT_GNUC_PREREQ(4, 3) && !RT_GNUC_PREREQ(5, 0) && defined(__i386__))
226# define RT_INLINE_ASM_GCC_4_3_X_X86 1
227#else
228# define RT_INLINE_ASM_GCC_4_3_X_X86 0
229#endif
230
231/** @def RT_INLINE_DONT_MIX_CMPXCHG8B_AND_PIC
232 * i686-apple-darwin9-gcc-4.0.1 (GCC) 4.0.1 (Apple Inc. build 5493) screws up
233 * RTSemRWRequestWrite semsemrw-lockless-generic.cpp in release builds. PIC
234 * mode, x86.
235 *
236 * Some gcc 4.3.x versions may have register allocation issues with cmpxchg8b
237 * when in PIC mode on x86.
238 */
239#ifndef RT_INLINE_DONT_MIX_CMPXCHG8B_AND_PIC
240# if defined(DOXYGEN_RUNNING) || defined(__WATCOMC__) /* Watcom has trouble with the expression below */
241# define RT_INLINE_DONT_MIX_CMPXCHG8B_AND_PIC 1
242# elif defined(_MSC_VER) /* Visual C++ has trouble too, but it'll only tell us when C4688 is enabled. */
243# define RT_INLINE_DONT_MIX_CMPXCHG8B_AND_PIC 0
244# elif ( (defined(PIC) || defined(__PIC__)) \
245 && defined(RT_ARCH_X86) \
246 && ( RT_INLINE_ASM_GCC_4_3_X_X86 \
247 || defined(RT_OS_DARWIN)) )
248# define RT_INLINE_DONT_MIX_CMPXCHG8B_AND_PIC 1
249# else
250# define RT_INLINE_DONT_MIX_CMPXCHG8B_AND_PIC 0
251# endif
252#endif
253
254
255/*
256 * ARM is great fun.
257 */
258#if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
259
260# define RTASM_ARM_NO_BARRIER
261# ifdef RT_ARCH_ARM64
262# define RTASM_ARM_NO_BARRIER_IN_REG
263# define RTASM_ARM_NO_BARRIER_COMMA_IN_REG
264# define RTASM_ARM_DSB_SY "dsb sy\n\t"
265# define RTASM_ARM_DSB_SY_IN_REG
266# define RTASM_ARM_DSB_SY_COMMA_IN_REG
267# define RTASM_ARM_DMB_SY "dmb sy\n\t"
268# define RTASM_ARM_DMB_SY_IN_REG
269# define RTASM_ARM_DMB_SY_COMMA_IN_REG
270# define RTASM_ARM_DMB_ST "dmb st\n\t"
271# define RTASM_ARM_DMB_ST_IN_REG
272# define RTASM_ARM_DMB_ST_COMMA_IN_REG
273# define RTASM_ARM_DMB_LD "dmb ld\n\t"
274# define RTASM_ARM_DMB_LD_IN_REG
275# define RTASM_ARM_DMB_LD_COMMA_IN_REG
276# define RTASM_ARM_PICK_6432(expr64, expr32) expr64
277# define RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_32(name, a_pu32Mem, barrier_type, modify64, modify32, in_reg) \
278 uint32_t rcSpill; \
279 uint32_t u32NewRet; \
280 __asm__ __volatile__("Ltry_again_" #name "_%=:\n\t" \
281 RTASM_ARM_##barrier_type /* before lable? */ \
282 "ldaxr %w[uNew], %[pMem]\n\t" \
283 modify64 \
284 "stlxr %w[rc], %w[uNew], %[pMem]\n\t" \
285 "cbnz %w[rc], Ltry_again_" #name "_%=\n\t" \
286 : [pMem] "+Q" (*a_pu32Mem) \
287 , [uNew] "=&r" (u32NewRet) \
288 , [rc] "=&r" (rcSpill) \
289 : in_reg \
290 : "cc")
291# define RTASM_ARM_LOAD_MODIFY_STORE_RET_OLD_32(name, a_pu32Mem, barrier_type, modify64, modify32, in_reg) \
292 uint32_t rcSpill; \
293 uint32_t u32OldRet; \
294 uint32_t u32NewSpill; \
295 __asm__ __volatile__("Ltry_again_" #name "_%=:\n\t" \
296 RTASM_ARM_##barrier_type /* before lable? */ \
297 "ldaxr %w[uOld], %[pMem]\n\t" \
298 modify64 \
299 "stlxr %w[rc], %w[uNew], %[pMem]\n\t" \
300 "cbnz %w[rc], Ltry_again_" #name "_%=\n\t" \
301 : [pMem] "+Q" (*a_pu32Mem) \
302 , [uOld] "=&r" (u32OldRet) \
303 , [uNew] "=&r" (u32NewSpill) \
304 , [rc] "=&r" (rcSpill) \
305 : in_reg \
306 : "cc")
307# define RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_64(name, a_pu64Mem, barrier_type, modify64, modify32, in_reg) \
308 uint32_t rcSpill; \
309 uint64_t u64NewRet; \
310 __asm__ __volatile__("Ltry_again_" #name "_%=:\n\t" \
311 RTASM_ARM_##barrier_type /* before lable? */ \
312 "ldaxr %[uNew], %[pMem]\n\t" \
313 modify64 \
314 "stlxr %w[rc], %[uNew], %[pMem]\n\t" \
315 "cbnz %w[rc], Ltry_again_" #name "_%=\n\t" \
316 : [pMem] "+Q" (*a_pu64Mem) \
317 , [uNew] "=&r" (u64NewRet) \
318 , [rc] "=&r" (rcSpill) \
319 : in_reg \
320 : "cc")
321# define RTASM_ARM_LOAD_MODIFY_STORE_RET_OLD_64(name, a_pu64Mem, barrier_type, modify64, modify32, in_reg) \
322 uint32_t rcSpill; \
323 uint64_t u64OldRet; \
324 uint64_t u64NewSpill; \
325 __asm__ __volatile__("Ltry_again_" #name "_%=:\n\t" \
326 RTASM_ARM_##barrier_type /* before lable? */ \
327 "ldaxr %[uOld], %[pMem]\n\t" \
328 modify64 \
329 "stlxr %w[rc], %[uNew], %[pMem]\n\t" \
330 "cbnz %w[rc], Ltry_again_" #name "_%=\n\t" \
331 : [pMem] "+Q" (*a_pu64Mem) \
332 , [uOld] "=&r" (u64OldRet) \
333 , [uNew] "=&r" (u64NewSpill) \
334 , [rc] "=&r" (rcSpill) \
335 : in_reg \
336 : "cc")
337
338# else /* RT_ARCH_ARM32 */
339# define RTASM_ARM_PICK_6432(expr64, expr32) expr32
340# if RT_ARCH_ARM32 >= 7
341# warning armv7
342# define RTASM_ARM_NO_BARRIER_IN_REG
343# define RTASM_ARM_NO_BARRIER_COMMA_IN_REG
344# define RTASM_ARM_DSB_SY "dsb sy\n\t"
345# define RTASM_ARM_DSB_SY_IN_REG "X" (0xfade)
346# define RTASM_ARM_DMB_SY "dmb sy\n\t"
347# define RTASM_ARM_DMB_SY_IN_REG "X" (0xfade)
348# define RTASM_ARM_DMB_ST "dmb st\n\t"
349# define RTASM_ARM_DMB_ST_IN_REG "X" (0xfade)
350# define RTASM_ARM_DMB_LD "dmb ld\n\t"
351# define RTASM_ARM_DMB_LD_IN_REG "X" (0xfade)
352
353# elif RT_ARCH_ARM32 >= 6
354# warning armv6
355# define RTASM_ARM_DSB_SY "mcr p15, 0, %[uZero], c7, c10, 4\n\t"
356# define RTASM_ARM_DSB_SY_IN_REG [uZero] "r" (0)
357# define RTASM_ARM_DMB_SY "mcr p15, 0, %[uZero], c7, c10, 5\n\t"
358# define RTASM_ARM_DMB_SY_IN_REG [uZero] "r" (0)
359# define RTASM_ARM_DMB_ST RTASM_ARM_DMB_SY
360# define RTASM_ARM_DMB_ST_IN_REG RTASM_ARM_DMB_SY_IN_REG
361# define RTASM_ARM_DMB_LD RTASM_ARM_DMB_SY
362# define RTASM_ARM_DMB_LD_IN_REG RTASM_ARM_DMB_SY_IN_REG
363
364# elif RT_ARCH_ARM32 >= 4
365# warning armv5 or older
366# define RTASM_ARM_DSB_SY "mcr p15, 0, %[uZero], c7, c10, 4\n\t"
367# define RTASM_ARM_DSB_SY_IN_REG [uZero] "r" (0)
368# define RTASM_ARM_DMB_SY RTASM_ARM_DSB_SY
369# define RTASM_ARM_DMB_SY_IN_REG RTASM_ARM_DSB_SY_IN_REG
370# define RTASM_ARM_DMB_ST RTASM_ARM_DSB_SY
371# define RTASM_ARM_DMB_ST_IN_REG RTASM_ARM_DSB_SY_IN_REG
372# define RTASM_ARM_DMB_LD RTASM_ARM_DSB_SY
373# define RTASM_ARM_DMB_LD_IN_REG RTASM_ARM_DSB_SY_IN_REG
374# else
375# error "huh? Odd RT_ARCH_ARM32 value!"
376# endif
377# define RTASM_ARM_DSB_SY_COMMA_IN_REG , RTASM_ARM_DSB_SY_IN_REG
378# define RTASM_ARM_DMB_SY_COMMA_IN_REG , RTASM_ARM_DMB_SY_IN_REG
379# define RTASM_ARM_DMB_ST_COMMA_IN_REG , RTASM_ARM_DMB_ST_IN_REG
380# define RTASM_ARM_DMB_LD_COMMA_IN_REG , RTASM_ARM_DMB_LD_IN_REG
381# define RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_32(name, a_pu32Mem, barrier_type, modify64, modify32, in_reg) \
382 uint32_t rcSpill; \
383 uint32_t u32NewRet; \
384 __asm__ __volatile__("Ltry_again_" #name "_%=:\n\t" \
385 RT_CONCAT(RTASM_ARM_,barrier_type) /* before lable? */ \
386 "ldrex %[uNew], %[pMem]\n\t" \
387 modify32 \
388 "strex %[rc], %[uNew], %[pMem]\n\t" \
389 "cmp %[rc], #0\n\t" \
390 "bne Ltry_again_" #name "_%=\n\t" \
391 : [pMem] "+m" (*a_pu32Mem) \
392 , [uNew] "=&r" (u32NewRet) \
393 , [rc] "=&r" (rcSpill) \
394 : RT_CONCAT3(RTASM_ARM_,barrier_type,_IN_REG) \
395 , in_reg \
396 : "cc")
397# define RTASM_ARM_LOAD_MODIFY_STORE_RET_OLD_32(name, a_pu32Mem, barrier_type, modify64, modify32, in_reg) \
398 uint32_t rcSpill; \
399 uint32_t u32OldRet; \
400 uint32_t u32NewSpill; \
401 __asm__ __volatile__("Ltry_again_" #name "_%=:\n\t" \
402 RT_CONCAT(RTASM_ARM_,barrier_type) /* before lable? */ \
403 "ldrex %[uOld], %[pMem]\n\t" \
404 modify32 \
405 "strex %[rc], %[uNew], %[pMem]\n\t" \
406 "cmp %[rc], #0\n\t" \
407 "bne Ltry_again_" #name "_%=\n\t" \
408 : [pMem] "+m" (*a_pu32Mem) \
409 , [uOld] "=&r" (u32OldRet) \
410 , [uNew] "=&r" (u32NewSpill) \
411 , [rc] "=&r" (rcSpill) \
412 : RT_CONCAT3(RTASM_ARM_,barrier_type,_IN_REG) \
413 , in_reg \
414 : "cc")
415# define RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_64(name, a_pu64Mem, barrier_type, modify64, modify32, in_reg) \
416 uint32_t rcSpill; \
417 uint64_t u64NewRet; \
418 __asm__ __volatile__("Ltry_again_" #name "_%=:\n\t" \
419 RT_CONCAT(RTASM_ARM_,barrier_type) /* before lable? */ \
420 "ldrexd %[uNew], %H[uNew], %[pMem]\n\t" \
421 modify32 \
422 "strexd %[rc], %[uNew], %H[uNew], %[pMem]\n\t" \
423 "cmp %[rc], #0\n\t" \
424 "bne Ltry_again_" #name "_%=\n\t" \
425 : [pMem] "+m" (*a_pu64Mem), \
426 [uNew] "=&r" (u64NewRet), \
427 [rc] "=&r" (rcSpill) \
428 : RT_CONCAT3(RTASM_ARM_,barrier_type,_IN_REG) \
429 , in_reg \
430 : "cc")
431# define RTASM_ARM_LOAD_MODIFY_STORE_RET_OLD_64(name, a_pu64Mem, barrier_type, modify64, modify32, in_reg) \
432 uint32_t rcSpill; \
433 uint64_t u64OldRet; \
434 uint64_t u64NewSpill; \
435 __asm__ __volatile__("Ltry_again_" #name "_%=:\n\t" \
436 RT_CONCAT(RTASM_ARM_,barrier_type) /* before lable? */ \
437 "ldrexd %[uOld], %H[uOld], %[pMem]\n\t" \
438 modify32 \
439 "strexd %[rc], %[uNew], %H[uNew], %[pMem]\n\t" \
440 "cmp %[rc], #0\n\t" \
441 "bne Ltry_again_" #name "_%=\n\t" \
442 : [pMem] "+m" (*a_pu64Mem), \
443 [uOld] "=&r" (u64OldRet), \
444 [uNew] "=&r" (u64NewSpill), \
445 [rc] "=&r" (rcSpill) \
446 : RT_CONCAT3(RTASM_ARM_,barrier_type,_IN_REG) \
447 , in_reg \
448 : "cc")
449# endif /* RT_ARCH_ARM32 */
450#endif
451
452
453/** @def ASMReturnAddress
454 * Gets the return address of the current (or calling if you like) function or method.
455 */
456#ifdef _MSC_VER
457# ifdef __cplusplus
458extern "C"
459# endif
460void * _ReturnAddress(void);
461# pragma intrinsic(_ReturnAddress)
462# define ASMReturnAddress() _ReturnAddress()
463#elif defined(__GNUC__) || defined(DOXYGEN_RUNNING)
464# define ASMReturnAddress() __builtin_return_address(0)
465#elif defined(__WATCOMC__)
466# define ASMReturnAddress() Watcom_does_not_appear_to_have_intrinsic_return_address_function()
467#else
468# error "Unsupported compiler."
469#endif
470
471
472/**
473 * Compiler memory barrier.
474 *
475 * Ensure that the compiler does not use any cached (register/tmp stack) memory
476 * values or any outstanding writes when returning from this function.
477 *
478 * This function must be used if non-volatile data is modified by a
479 * device or the VMM. Typical cases are port access, MMIO access,
480 * trapping instruction, etc.
481 */
482#if RT_INLINE_ASM_GNU_STYLE
483# define ASMCompilerBarrier() do { __asm__ __volatile__("" : : : "memory"); } while (0)
484#elif RT_INLINE_ASM_USES_INTRIN
485# define ASMCompilerBarrier() do { _ReadWriteBarrier(); } while (0)
486#elif defined(__WATCOMC__)
487void ASMCompilerBarrier(void);
488#else /* 2003 should have _ReadWriteBarrier() but I guess we're at 2002 level then... */
489DECLINLINE(void) ASMCompilerBarrier(void) RT_NOTHROW_DEF
490{
491 __asm
492 {
493 }
494}
495#endif
496
497
498/** @def ASMBreakpoint
499 * Debugger Breakpoint.
500 * @deprecated Use RT_BREAKPOINT instead.
501 * @internal
502 */
503#define ASMBreakpoint() RT_BREAKPOINT()
504
505
506/**
507 * Spinloop hint for platforms that have these, empty function on the other
508 * platforms.
509 *
510 * x86 & AMD64: The PAUSE variant of NOP for helping hyperthreaded CPUs detecting
511 * spin locks.
512 */
513#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && (defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86))
514RT_ASM_DECL_PRAGMA_WATCOM(void) ASMNopPause(void) RT_NOTHROW_PROTO;
515#else
516DECLINLINE(void) ASMNopPause(void) RT_NOTHROW_DEF
517{
518# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
519# if RT_INLINE_ASM_GNU_STYLE
520 __asm__ __volatile__(".byte 0xf3,0x90\n\t");
521# else
522 __asm {
523 _emit 0f3h
524 _emit 090h
525 }
526# endif
527
528# elif defined(RT_ARCH_ARM32) || defined(RT_ARCH_ARM64)
529# if RT_INLINE_ASM_USES_INTRIN
530 __yield();
531# else
532 __asm__ __volatile__("yield\n\t"); /* ARMv6K+ */
533# endif
534
535# else
536 /* dummy */
537# endif
538}
539#endif
540
541
542/**
543 * Atomically Exchange an unsigned 8-bit value, ordered.
544 *
545 * @returns Current *pu8 value
546 * @param pu8 Pointer to the 8-bit variable to update.
547 * @param u8 The 8-bit value to assign to *pu8.
548 */
549#if RT_INLINE_ASM_EXTERNAL_TMP_ARM
550RT_ASM_DECL_PRAGMA_WATCOM(uint8_t) ASMAtomicXchgU8(volatile uint8_t RT_FAR *pu8, uint8_t u8) RT_NOTHROW_PROTO;
551#else
552DECLINLINE(uint8_t) ASMAtomicXchgU8(volatile uint8_t RT_FAR *pu8, uint8_t u8) RT_NOTHROW_DEF
553{
554# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
555# if RT_INLINE_ASM_GNU_STYLE
556 __asm__ __volatile__("xchgb %0, %1\n\t"
557 : "=m" (*pu8)
558 , "=q" (u8) /* =r - busted on g++ (GCC) 3.4.4 20050721 (Red Hat 3.4.4-2) */
559 : "1" (u8)
560 , "m" (*pu8));
561# else
562 __asm
563 {
564# ifdef RT_ARCH_AMD64
565 mov rdx, [pu8]
566 mov al, [u8]
567 xchg [rdx], al
568 mov [u8], al
569# else
570 mov edx, [pu8]
571 mov al, [u8]
572 xchg [edx], al
573 mov [u8], al
574# endif
575 }
576# endif
577 return u8;
578
579# elif defined(RT_ARCH_ARM32) || defined(RT_ARCH_ARM64)
580# if RT_INLINE_ASM_USES_INTRIN
581# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
582 return __swpal8(pu8, u8);
583# else
584 uint8_t uOld = __swp8(pu8, u8);
585 __dmb(_ARM64_BARRIER_SY);
586 return uOld;
587# endif
588
589# else
590 uint32_t uOld;
591# if defined(RTASM_ARM64_USE_FEAT_LSE)
592 /* SWPALB is ~40% more expensive than the non-LSE variant (M1), but since we
593 have the barrier we shouldn't need that, right? Ordering should be taken
594 care of by the DMB. The SWPB is rather cheap (~70% faster). */
595 __asm__ __volatile__("Lstart_ASMAtomicXchgU8_%=:\n\t"
596# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
597 "swpalb %w[uNew], %w[uOld], %[pMem]\n\t"
598# else
599 RTASM_ARM_DMB_SY
600 "swpb %w[uNew], %w[uOld], %[pMem]\n\t"
601# endif
602 : [pMem] "+Q" (*pu8)
603 , [uOld] "=&r" (uOld)
604 : [uNew] "r" ((uint32_t)u8)
605 : );
606# else
607 uint32_t rcSpill;
608 __asm__ __volatile__("Ltry_again_ASMAtomicXchgU8_%=:\n\t"
609 RTASM_ARM_DMB_SY
610# if defined(RT_ARCH_ARM64)
611 "ldaxrb %w[uOld], %[pMem]\n\t"
612 "stlxrb %w[rc], %w[uNew], %[pMem]\n\t"
613 "cbnz %w[rc], Ltry_again_ASMAtomicXchgU8_%=\n\t"
614# else
615 "ldrexb %[uOld], %[pMem]\n\t" /* ARMv6+ */
616 "strexb %[rc], %[uNew], %[pMem]\n\t"
617 "cmp %[rc], #0\n\t"
618 "bne Ltry_again_ASMAtomicXchgU8_%=\n\t"
619# endif
620 : [pMem] "+Q" (*pu8)
621 , [uOld] "=&r" (uOld)
622 , [rc] "=&r" (rcSpill)
623 : [uNew] "r" ((uint32_t)u8)
624 RTASM_ARM_DMB_SY_COMMA_IN_REG
625 : "cc");
626# endif
627 return (uint8_t)uOld;
628# endif
629
630# else
631# error "Port me"
632# endif
633}
634#endif
635
636
637/**
638 * Atomically Exchange a signed 8-bit value, ordered.
639 *
640 * @returns Current *pu8 value
641 * @param pi8 Pointer to the 8-bit variable to update.
642 * @param i8 The 8-bit value to assign to *pi8.
643 */
644DECLINLINE(int8_t) ASMAtomicXchgS8(volatile int8_t RT_FAR *pi8, int8_t i8) RT_NOTHROW_DEF
645{
646 return (int8_t)ASMAtomicXchgU8((volatile uint8_t RT_FAR *)pi8, (uint8_t)i8);
647}
648
649
650/**
651 * Atomically Exchange a bool value, ordered.
652 *
653 * @returns Current *pf value
654 * @param pf Pointer to the 8-bit variable to update.
655 * @param f The 8-bit value to assign to *pi8.
656 */
657DECLINLINE(bool) ASMAtomicXchgBool(volatile bool RT_FAR *pf, bool f) RT_NOTHROW_DEF
658{
659#ifdef _MSC_VER
660 return !!ASMAtomicXchgU8((volatile uint8_t RT_FAR *)pf, (uint8_t)f);
661#else
662 return (bool)ASMAtomicXchgU8((volatile uint8_t RT_FAR *)pf, (uint8_t)f);
663#endif
664}
665
666
667/**
668 * Atomically Exchange an unsigned 16-bit value, ordered.
669 *
670 * @returns Current *pu16 value
671 * @param pu16 Pointer to the 16-bit variable to update.
672 * @param u16 The 16-bit value to assign to *pu16.
673 */
674#if RT_INLINE_ASM_EXTERNAL_TMP_ARM
675RT_ASM_DECL_PRAGMA_WATCOM(uint16_t) ASMAtomicXchgU16(volatile uint16_t RT_FAR *pu16, uint16_t u16) RT_NOTHROW_PROTO;
676#else
677DECLINLINE(uint16_t) ASMAtomicXchgU16(volatile uint16_t RT_FAR *pu16, uint16_t u16) RT_NOTHROW_DEF
678{
679# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
680# if RT_INLINE_ASM_GNU_STYLE
681 __asm__ __volatile__("xchgw %0, %1\n\t"
682 : "=m" (*pu16)
683 , "=r" (u16)
684 : "1" (u16)
685 , "m" (*pu16));
686# else
687 __asm
688 {
689# ifdef RT_ARCH_AMD64
690 mov rdx, [pu16]
691 mov ax, [u16]
692 xchg [rdx], ax
693 mov [u16], ax
694# else
695 mov edx, [pu16]
696 mov ax, [u16]
697 xchg [edx], ax
698 mov [u16], ax
699# endif
700 }
701# endif
702 return u16;
703
704# elif defined(RT_ARCH_ARM32) || defined(RT_ARCH_ARM64)
705# if RT_INLINE_ASM_USES_INTRIN
706# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
707 return __swpal16(pu16, u16);
708# else
709 uint16_t uOld = __swp16(pu16, u16);
710 __dmb(_ARM64_BARRIER_SY);
711 return uOld;
712# endif
713
714# else
715 uint32_t uOld;
716# if defined(RTASM_ARM64_USE_FEAT_LSE)
717 /* SWPALH is ~40% more expensive than the non-LSE variant on an M1, 20%
718 slower if we remove the barrier. But since we have the barrier we
719 shouldn't need that, right? Ordering should be taken care of by the DMB.
720 The SWPH is rather cheap (~70% faster). */
721 __asm__ __volatile__("Lstart_ASMAtomicXchgU16_%=:\n\t"
722# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
723 "swpalh %w[uNew], %w[uOld], %[pMem]\n\t"
724# else
725 RTASM_ARM_DMB_SY
726 "swph %w[uNew], %w[uOld], %[pMem]\n\t"
727# endif
728 : [pMem] "+Q" (*pu16)
729 , [uOld] "=&r" (uOld)
730 : [uNew] "r" ((uint32_t)u16)
731 : );
732# else
733 uint32_t rcSpill;
734 __asm__ __volatile__("Ltry_again_ASMAtomicXchgU16_%=:\n\t"
735 RTASM_ARM_DMB_SY
736# if defined(RT_ARCH_ARM64)
737 "ldaxrh %w[uOld], %[pMem]\n\t"
738 "stlxrh %w[rc], %w[uNew], %[pMem]\n\t"
739 "cbnz %w[rc], Ltry_again_ASMAtomicXchgU16_%=\n\t"
740# else
741 "ldrexh %[uOld], %[pMem]\n\t" /* ARMv6+ */
742 "strexh %[rc], %[uNew], %[pMem]\n\t"
743 "cmp %[rc], #0\n\t"
744 "bne Ltry_again_ASMAtomicXchgU16_%=\n\t"
745# endif
746 : [pMem] "+Q" (*pu16)
747 , [uOld] "=&r" (uOld)
748 , [rc] "=&r" (rcSpill)
749 : [uNew] "r" ((uint32_t)u16)
750 RTASM_ARM_DMB_SY_COMMA_IN_REG
751 : "cc");
752# endif
753 return (uint16_t)uOld;
754# endif
755
756# else
757# error "Port me"
758# endif
759}
760#endif
761
762
763/**
764 * Atomically Exchange a signed 16-bit value, ordered.
765 *
766 * @returns Current *pu16 value
767 * @param pi16 Pointer to the 16-bit variable to update.
768 * @param i16 The 16-bit value to assign to *pi16.
769 */
770DECLINLINE(int16_t) ASMAtomicXchgS16(volatile int16_t RT_FAR *pi16, int16_t i16) RT_NOTHROW_DEF
771{
772 return (int16_t)ASMAtomicXchgU16((volatile uint16_t RT_FAR *)pi16, (uint16_t)i16);
773}
774
775
776/**
777 * Atomically Exchange an unsigned 32-bit value, ordered.
778 *
779 * @returns Current *pu32 value
780 * @param pu32 Pointer to the 32-bit variable to update.
781 * @param u32 The 32-bit value to assign to *pu32.
782 *
783 * @remarks Does not work on 286 and earlier.
784 */
785#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
786RT_ASM_DECL_PRAGMA_WATCOM(uint32_t) ASMAtomicXchgU32(volatile uint32_t RT_FAR *pu32, uint32_t u32) RT_NOTHROW_PROTO;
787#else
788DECLINLINE(uint32_t) ASMAtomicXchgU32(volatile uint32_t RT_FAR *pu32, uint32_t u32) RT_NOTHROW_DEF
789{
790# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
791# if RT_INLINE_ASM_GNU_STYLE
792 __asm__ __volatile__("xchgl %0, %1\n\t"
793 : "=m" (*pu32) /** @todo r=bird: +m rather than =m here? */
794 , "=r" (u32)
795 : "1" (u32)
796 , "m" (*pu32));
797
798# elif RT_INLINE_ASM_USES_INTRIN
799 u32 = _InterlockedExchange((long RT_FAR *)pu32, u32);
800
801# else
802 __asm
803 {
804# ifdef RT_ARCH_AMD64
805 mov rdx, [pu32]
806 mov eax, u32
807 xchg [rdx], eax
808 mov [u32], eax
809# else
810 mov edx, [pu32]
811 mov eax, u32
812 xchg [edx], eax
813 mov [u32], eax
814# endif
815 }
816# endif
817 return u32;
818
819# elif defined(RT_ARCH_ARM32) || defined(RT_ARCH_ARM64)
820
821# if RT_INLINE_ASM_USES_INTRIN
822# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
823 return __swpal32(pu32, u32);
824# else
825 uint32_t uOld = __swp32(pu32, u32);
826 __dmb(_ARM64_BARRIER_SY);
827 return uOld;
828# endif
829
830# else
831 uint32_t uOld;
832# if defined(RTASM_ARM64_USE_FEAT_LSE)
833 /* SWPAL is ~40% more expensive than the non-LSE variant on an M1, 20%
834 slower if we remove the barrier. But since we have the barrier we
835 shouldn't need that, right? Ordering should be taken care of by the DMB.
836 The SWP is rather cheap (~70% faster). */
837 __asm__ __volatile__("Lstart_ASMAtomicXchgU32_%=:\n\t"
838# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
839 "swpal %w[uNew], %w[uOld], %[pMem]\n\t"
840# else
841 RTASM_ARM_DMB_SY
842 "swp %w[uNew], %w[uOld], %[pMem]\n\t"
843# endif
844 : [pMem] "+Q" (*pu32)
845 , [uOld] "=&r" (uOld)
846 : [uNew] "r" (u32)
847 : );
848# else
849 uint32_t rcSpill;
850 __asm__ __volatile__("Ltry_again_ASMAtomicXchgU32_%=:\n\t"
851 RTASM_ARM_DMB_SY
852# if defined(RT_ARCH_ARM64)
853 "ldaxr %w[uOld], %[pMem]\n\t"
854 "stlxr %w[rc], %w[uNew], %[pMem]\n\t"
855 "cbnz %w[rc], Ltry_again_ASMAtomicXchgU32_%=\n\t"
856# else
857 "ldrex %[uOld], %[pMem]\n\t" /* ARMv6+ */
858 "strex %[rc], %[uNew], %[pMem]\n\t"
859 "cmp %[rc], #0\n\t"
860 "bne Ltry_again_ASMAtomicXchgU32_%=\n\t"
861# endif
862 : [pMem] "+Q" (*pu32)
863 , [uOld] "=&r" (uOld)
864 , [rc] "=&r" (rcSpill)
865 : [uNew] "r" (u32)
866 RTASM_ARM_DMB_SY_COMMA_IN_REG
867 : "cc");
868# endif
869 return uOld;
870# endif
871
872# else
873# error "Port me"
874# endif
875}
876#endif
877
878
879/**
880 * Atomically Exchange a signed 32-bit value, ordered.
881 *
882 * @returns Current *pu32 value
883 * @param pi32 Pointer to the 32-bit variable to update.
884 * @param i32 The 32-bit value to assign to *pi32.
885 */
886DECLINLINE(int32_t) ASMAtomicXchgS32(volatile int32_t RT_FAR *pi32, int32_t i32) RT_NOTHROW_DEF
887{
888 return (int32_t)ASMAtomicXchgU32((volatile uint32_t RT_FAR *)pi32, (uint32_t)i32);
889}
890
891
892/**
893 * Atomically Exchange an unsigned 64-bit value, ordered.
894 *
895 * @returns Current *pu64 value
896 * @param pu64 Pointer to the 64-bit variable to update.
897 * @param u64 The 64-bit value to assign to *pu64.
898 *
899 * @remarks Works on 32-bit x86 CPUs starting with Pentium.
900 */
901#if (RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN) \
902 || RT_INLINE_DONT_MIX_CMPXCHG8B_AND_PIC
903RT_ASM_DECL_PRAGMA_WATCOM(uint64_t) ASMAtomicXchgU64(volatile uint64_t RT_FAR *pu64, uint64_t u64) RT_NOTHROW_PROTO;
904#else
905DECLINLINE(uint64_t) ASMAtomicXchgU64(volatile uint64_t RT_FAR *pu64, uint64_t u64) RT_NOTHROW_DEF
906{
907# if defined(RT_ARCH_AMD64)
908# if RT_INLINE_ASM_USES_INTRIN
909 return _InterlockedExchange64((__int64 *)pu64, u64);
910
911# elif RT_INLINE_ASM_GNU_STYLE
912 __asm__ __volatile__("xchgq %0, %1\n\t"
913 : "=m" (*pu64)
914 , "=r" (u64)
915 : "1" (u64)
916 , "m" (*pu64));
917 return u64;
918# else
919 __asm
920 {
921 mov rdx, [pu64]
922 mov rax, [u64]
923 xchg [rdx], rax
924 mov [u64], rax
925 }
926 return u64;
927# endif
928
929# elif defined(RT_ARCH_X86)
930# if RT_INLINE_ASM_GNU_STYLE
931# if defined(PIC) || defined(__PIC__)
932 uint32_t u32EBX = (uint32_t)u64;
933 __asm__ __volatile__(/*"xchgl %%esi, %5\n\t"*/
934 "xchgl %%ebx, %3\n\t"
935 "1:\n\t"
936 "lock; cmpxchg8b (%5)\n\t"
937 "jnz 1b\n\t"
938 "movl %3, %%ebx\n\t"
939 /*"xchgl %%esi, %5\n\t"*/
940 : "=A" (u64)
941 , "=m" (*pu64)
942 : "0" (*pu64)
943 , "m" ( u32EBX )
944 , "c" ( (uint32_t)(u64 >> 32) )
945 , "S" (pu64)
946 : "cc");
947# else /* !PIC */
948 __asm__ __volatile__("1:\n\t"
949 "lock; cmpxchg8b %1\n\t"
950 "jnz 1b\n\t"
951 : "=A" (u64)
952 , "=m" (*pu64)
953 : "0" (*pu64)
954 , "b" ( (uint32_t)u64 )
955 , "c" ( (uint32_t)(u64 >> 32) )
956 : "cc");
957# endif
958# else
959 __asm
960 {
961 mov ebx, dword ptr [u64]
962 mov ecx, dword ptr [u64 + 4]
963 mov edi, pu64
964 mov eax, dword ptr [edi]
965 mov edx, dword ptr [edi + 4]
966 retry:
967 lock cmpxchg8b [edi]
968 jnz retry
969 mov dword ptr [u64], eax
970 mov dword ptr [u64 + 4], edx
971 }
972# endif
973 return u64;
974
975# elif defined(RT_ARCH_ARM32) || defined(RT_ARCH_ARM64)
976# if RT_INLINE_ASM_USES_INTRIN
977# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
978 return __swpal64(pu64, u64);
979# else
980 uint64_t uOld = __swp64(pu64, u64);
981 __dmb(_ARM64_BARRIER_SY);
982 return uOld;
983# endif
984
985# else
986 uint64_t uOld;
987# if defined(RTASM_ARM64_USE_FEAT_LSE)
988 /* SWPAL is ~40% more expensive than the non-LSE variant on an M1, 20%
989 slower if we remove the barrier. But since we have the barrier we
990 shouldn't need that, right? Ordering should be taken care of by the DMB.
991 The SWP is rather cheap (~70% faster). */
992 __asm__ __volatile__("Lstart_ASMAtomicXchgU64_%=:\n\t"
993# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
994 "swpal %[uNew], %[uOld], %[pMem]\n\t"
995# else
996 RTASM_ARM_DMB_SY
997 "swp %[uNew], %[uOld], %[pMem]\n\t"
998# endif
999 : [pMem] "+Q" (*pu64)
1000 , [uOld] "=&r" (uOld)
1001 : [uNew] "r" (u64)
1002 : );
1003# else
1004 uint32_t rcSpill;
1005 __asm__ __volatile__("Ltry_again_ASMAtomicXchgU64_%=:\n\t"
1006 RTASM_ARM_DMB_SY
1007# if defined(RT_ARCH_ARM64)
1008 "ldaxr %[uOld], %[pMem]\n\t"
1009 "stlxr %w[rc], %[uNew], %[pMem]\n\t"
1010 "cbnz %w[rc], Ltry_again_ASMAtomicXchgU64_%=\n\t"
1011# else
1012 "ldrexd %[uOld], %H[uOld], %[pMem]\n\t" /* ARMv6+ */
1013 "strexd %[rc], %[uNew], %H[uNew], %[pMem]\n\t"
1014 "cmp %[rc], #0\n\t"
1015 "bne Ltry_again_ASMAtomicXchgU64_%=\n\t"
1016# endif
1017 : [pMem] "+Q" (*pu64)
1018 , [uOld] "=&r" (uOld)
1019 , [rc] "=&r" (rcSpill)
1020 : [uNew] "r" (u64)
1021 RTASM_ARM_DMB_SY_COMMA_IN_REG
1022 : "cc");
1023# endif
1024 return uOld;
1025# endif
1026
1027# else
1028# error "Port me"
1029# endif
1030}
1031#endif
1032
1033
1034/**
1035 * Atomically Exchange an signed 64-bit value, ordered.
1036 *
1037 * @returns Current *pi64 value
1038 * @param pi64 Pointer to the 64-bit variable to update.
1039 * @param i64 The 64-bit value to assign to *pi64.
1040 */
1041DECLINLINE(int64_t) ASMAtomicXchgS64(volatile int64_t RT_FAR *pi64, int64_t i64) RT_NOTHROW_DEF
1042{
1043 return (int64_t)ASMAtomicXchgU64((volatile uint64_t RT_FAR *)pi64, (uint64_t)i64);
1044}
1045
1046
1047/**
1048 * Atomically Exchange a size_t value, ordered.
1049 *
1050 * @returns Current *ppv value
1051 * @param puDst Pointer to the size_t variable to update.
1052 * @param uNew The new value to assign to *puDst.
1053 */
1054DECLINLINE(size_t) ASMAtomicXchgZ(size_t volatile RT_FAR *puDst, const size_t uNew) RT_NOTHROW_DEF
1055{
1056#if ARCH_BITS == 16
1057 AssertCompile(sizeof(size_t) == 2);
1058 return ASMAtomicXchgU16((volatile uint16_t RT_FAR *)puDst, uNew);
1059#elif ARCH_BITS == 32
1060 return ASMAtomicXchgU32((volatile uint32_t RT_FAR *)puDst, uNew);
1061#elif ARCH_BITS == 64
1062 return ASMAtomicXchgU64((volatile uint64_t RT_FAR *)puDst, uNew);
1063#else
1064# error "ARCH_BITS is bogus"
1065#endif
1066}
1067
1068
1069/**
1070 * Atomically Exchange a pointer value, ordered.
1071 *
1072 * @returns Current *ppv value
1073 * @param ppv Pointer to the pointer variable to update.
1074 * @param pv The pointer value to assign to *ppv.
1075 */
1076DECLINLINE(void RT_FAR *) ASMAtomicXchgPtr(void RT_FAR * volatile RT_FAR *ppv, const void RT_FAR *pv) RT_NOTHROW_DEF
1077{
1078#if ARCH_BITS == 32 || ARCH_BITS == 16
1079 return (void RT_FAR *)ASMAtomicXchgU32((volatile uint32_t RT_FAR *)(void RT_FAR *)ppv, (uint32_t)pv);
1080#elif ARCH_BITS == 64
1081 return (void RT_FAR *)ASMAtomicXchgU64((volatile uint64_t RT_FAR *)(void RT_FAR *)ppv, (uint64_t)pv);
1082#else
1083# error "ARCH_BITS is bogus"
1084#endif
1085}
1086
1087
1088/**
1089 * Convenience macro for avoiding the annoying casting with ASMAtomicXchgPtr.
1090 *
1091 * @returns Current *pv value
1092 * @param ppv Pointer to the pointer variable to update.
1093 * @param pv The pointer value to assign to *ppv.
1094 * @param Type The type of *ppv, sans volatile.
1095 */
1096#ifdef __GNUC__ /* 8.2.0 requires -Wno-ignored-qualifiers */
1097# define ASMAtomicXchgPtrT(ppv, pv, Type) \
1098 __extension__ \
1099 ({\
1100 __typeof__(*(ppv)) volatile * const ppvTypeChecked = (ppv); \
1101 Type const pvTypeChecked = (pv); \
1102 Type pvTypeCheckedRet = (__typeof__(*(ppv))) ASMAtomicXchgPtr((void * volatile *)ppvTypeChecked, (void *)pvTypeChecked); \
1103 pvTypeCheckedRet; \
1104 })
1105#else
1106# define ASMAtomicXchgPtrT(ppv, pv, Type) \
1107 (Type)ASMAtomicXchgPtr((void RT_FAR * volatile RT_FAR *)(ppv), (void RT_FAR *)(pv))
1108#endif
1109
1110
1111/**
1112 * Atomically Exchange a raw-mode context pointer value, ordered.
1113 *
1114 * @returns Current *ppv value
1115 * @param ppvRC Pointer to the pointer variable to update.
1116 * @param pvRC The pointer value to assign to *ppv.
1117 */
1118DECLINLINE(RTRCPTR) ASMAtomicXchgRCPtr(RTRCPTR volatile RT_FAR *ppvRC, RTRCPTR pvRC) RT_NOTHROW_DEF
1119{
1120 return (RTRCPTR)ASMAtomicXchgU32((uint32_t volatile RT_FAR *)(void RT_FAR *)ppvRC, (uint32_t)pvRC);
1121}
1122
1123
1124/**
1125 * Atomically Exchange a ring-0 pointer value, ordered.
1126 *
1127 * @returns Current *ppv value
1128 * @param ppvR0 Pointer to the pointer variable to update.
1129 * @param pvR0 The pointer value to assign to *ppv.
1130 */
1131DECLINLINE(RTR0PTR) ASMAtomicXchgR0Ptr(RTR0PTR volatile RT_FAR *ppvR0, RTR0PTR pvR0) RT_NOTHROW_DEF
1132{
1133#if R0_ARCH_BITS == 32 || ARCH_BITS == 16
1134 return (RTR0PTR)ASMAtomicXchgU32((volatile uint32_t RT_FAR *)(void RT_FAR *)ppvR0, (uint32_t)pvR0);
1135#elif R0_ARCH_BITS == 64
1136 return (RTR0PTR)ASMAtomicXchgU64((volatile uint64_t RT_FAR *)(void RT_FAR *)ppvR0, (uint64_t)pvR0);
1137#else
1138# error "R0_ARCH_BITS is bogus"
1139#endif
1140}
1141
1142
1143/**
1144 * Atomically Exchange a ring-3 pointer value, ordered.
1145 *
1146 * @returns Current *ppv value
1147 * @param ppvR3 Pointer to the pointer variable to update.
1148 * @param pvR3 The pointer value to assign to *ppv.
1149 */
1150DECLINLINE(RTR3PTR) ASMAtomicXchgR3Ptr(RTR3PTR volatile RT_FAR *ppvR3, RTR3PTR pvR3) RT_NOTHROW_DEF
1151{
1152#if R3_ARCH_BITS == 32 || ARCH_BITS == 16
1153 return (RTR3PTR)ASMAtomicXchgU32((volatile uint32_t RT_FAR *)(void RT_FAR *)ppvR3, (uint32_t)pvR3);
1154#elif R3_ARCH_BITS == 64
1155 return (RTR3PTR)ASMAtomicXchgU64((volatile uint64_t RT_FAR *)(void RT_FAR *)ppvR3, (uint64_t)pvR3);
1156#else
1157# error "R3_ARCH_BITS is bogus"
1158#endif
1159}
1160
1161
1162/** @def ASMAtomicXchgHandle
1163 * Atomically Exchange a typical IPRT handle value, ordered.
1164 *
1165 * @param ph Pointer to the value to update.
1166 * @param hNew The new value to assigned to *pu.
1167 * @param phRes Where to store the current *ph value.
1168 *
1169 * @remarks This doesn't currently work for all handles (like RTFILE).
1170 */
1171#if HC_ARCH_BITS == 32 || ARCH_BITS == 16
1172# define ASMAtomicXchgHandle(ph, hNew, phRes) \
1173 do { \
1174 AssertCompile(sizeof(*(ph)) == sizeof(uint32_t)); \
1175 AssertCompile(sizeof(*(phRes)) == sizeof(uint32_t)); \
1176 *(uint32_t RT_FAR *)(phRes) = ASMAtomicXchgU32((uint32_t volatile RT_FAR *)(ph), (const uint32_t)(hNew)); \
1177 } while (0)
1178#elif HC_ARCH_BITS == 64
1179# define ASMAtomicXchgHandle(ph, hNew, phRes) \
1180 do { \
1181 AssertCompile(sizeof(*(ph)) == sizeof(uint64_t)); \
1182 AssertCompile(sizeof(*(phRes)) == sizeof(uint64_t)); \
1183 *(uint64_t RT_FAR *)(phRes) = ASMAtomicXchgU64((uint64_t volatile RT_FAR *)(ph), (const uint64_t)(hNew)); \
1184 } while (0)
1185#else
1186# error HC_ARCH_BITS
1187#endif
1188
1189
1190/**
1191 * Atomically Exchange a value which size might differ
1192 * between platforms or compilers, ordered.
1193 *
1194 * @param pu Pointer to the variable to update.
1195 * @param uNew The value to assign to *pu.
1196 * @todo This is busted as its missing the result argument.
1197 */
1198#define ASMAtomicXchgSize(pu, uNew) \
1199 do { \
1200 switch (sizeof(*(pu))) { \
1201 case 1: ASMAtomicXchgU8( (volatile uint8_t RT_FAR *)(void RT_FAR *)(pu), (uint8_t)(uNew)); break; \
1202 case 2: ASMAtomicXchgU16((volatile uint16_t RT_FAR *)(void RT_FAR *)(pu), (uint16_t)(uNew)); break; \
1203 case 4: ASMAtomicXchgU32((volatile uint32_t RT_FAR *)(void RT_FAR *)(pu), (uint32_t)(uNew)); break; \
1204 case 8: ASMAtomicXchgU64((volatile uint64_t RT_FAR *)(void RT_FAR *)(pu), (uint64_t)(uNew)); break; \
1205 default: AssertMsgFailed(("ASMAtomicXchgSize: size %d is not supported\n", sizeof(*(pu)))); \
1206 } \
1207 } while (0)
1208
1209/**
1210 * Atomically Exchange a value which size might differ
1211 * between platforms or compilers, ordered.
1212 *
1213 * @param pu Pointer to the variable to update.
1214 * @param uNew The value to assign to *pu.
1215 * @param puRes Where to store the current *pu value.
1216 */
1217#define ASMAtomicXchgSizeCorrect(pu, uNew, puRes) \
1218 do { \
1219 switch (sizeof(*(pu))) { \
1220 case 1: *(uint8_t RT_FAR *)(puRes) = ASMAtomicXchgU8( (volatile uint8_t RT_FAR *)(void RT_FAR *)(pu), (uint8_t)(uNew)); break; \
1221 case 2: *(uint16_t RT_FAR *)(puRes) = ASMAtomicXchgU16((volatile uint16_t RT_FAR *)(void RT_FAR *)(pu), (uint16_t)(uNew)); break; \
1222 case 4: *(uint32_t RT_FAR *)(puRes) = ASMAtomicXchgU32((volatile uint32_t RT_FAR *)(void RT_FAR *)(pu), (uint32_t)(uNew)); break; \
1223 case 8: *(uint64_t RT_FAR *)(puRes) = ASMAtomicXchgU64((volatile uint64_t RT_FAR *)(void RT_FAR *)(pu), (uint64_t)(uNew)); break; \
1224 default: AssertMsgFailed(("ASMAtomicXchgSize: size %d is not supported\n", sizeof(*(pu)))); \
1225 } \
1226 } while (0)
1227
1228
1229
1230/**
1231 * Atomically Compare and Exchange an unsigned 8-bit value, ordered.
1232 *
1233 * @returns true if xchg was done.
1234 * @returns false if xchg wasn't done.
1235 *
1236 * @param pu8 Pointer to the value to update.
1237 * @param u8New The new value to assigned to *pu8.
1238 * @param u8Old The old value to *pu8 compare with.
1239 *
1240 * @remarks x86: Requires a 486 or later.
1241 * @todo Rename ASMAtomicCmpWriteU8
1242 */
1243#if RT_INLINE_ASM_EXTERNAL_TMP_ARM || (!RT_INLINE_ASM_GNU_STYLE && !defined(RT_ARCH_ARM64) && !defined(RT_ARCH_ARM32))
1244RT_ASM_DECL_PRAGMA_WATCOM(bool) ASMAtomicCmpXchgU8(volatile uint8_t RT_FAR *pu8, const uint8_t u8New, const uint8_t u8Old) RT_NOTHROW_PROTO;
1245#else
1246DECLINLINE(bool) ASMAtomicCmpXchgU8(volatile uint8_t RT_FAR *pu8, const uint8_t u8New, uint8_t u8Old) RT_NOTHROW_DEF
1247{
1248# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
1249 uint8_t u8Ret;
1250 __asm__ __volatile__("lock; cmpxchgb %3, %0\n\t"
1251 "setz %1\n\t"
1252 : "=m" (*pu8)
1253 , "=qm" (u8Ret)
1254 , "=a" (u8Old)
1255 : "q" (u8New)
1256 , "2" (u8Old)
1257 , "m" (*pu8)
1258 : "cc");
1259 return (bool)u8Ret;
1260
1261# elif RT_INLINE_ASM_USES_INTRIN
1262 return (uint8_t)_InterlockedCompareExchange8((char RT_FAR *)pu8, u8New, u8Old) == u8Old;
1263
1264# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
1265 union { uint32_t u; bool f; } fXchg;
1266 uint32_t u32Spill;
1267# if defined(RTASM_ARM64_USE_FEAT_LSE)
1268 __asm__ __volatile__("Lstart_ASMAtomicCmpXchgU8_%=:\n\t"
1269# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB) /* M1 bench: casalb=5625 vs dmb+casb=1597 vs non-lse=5623 (ps/call) */
1270 "casalb %w[uOldActual], %w[uNew], %[pMem]\n\t"
1271# else
1272 RTASM_ARM_DMB_SY
1273 "casb %w[uOldActual], %w[uNew], %[pMem]\n\t"
1274# endif
1275 "cmp %w[uOldActual], %w[uOldOrg]\n\t"
1276 "cset %w[fXchg], eq\n\t"
1277 : [pMem] "+Q" (*pu8)
1278 , [uOldActual] "=&r" (u32Spill)
1279 , [fXchg] "=&r" (fXchg.u)
1280 : [uNew] "r" ((uint32_t)u8New)
1281 , [uOldOrg] "r" ((uint32_t)u8Old)
1282 , "[uOldActual]" ((uint32_t)u8Old)
1283 : "cc");
1284# else
1285 uint32_t rcSpill;
1286 __asm__ __volatile__("Ltry_again_ASMAtomicCmpXchgU8_%=:\n\t"
1287 RTASM_ARM_DMB_SY
1288# if defined(RT_ARCH_ARM64)
1289 "ldaxrb %w[uOld], %[pMem]\n\t"
1290 "cmp %w[uOld], %w[uCmp]\n\t"
1291 "bne 1f\n\t" /* stop here if not equal */
1292 "stlxrb %w[rc], %w[uNew], %[pMem]\n\t"
1293 "cbnz %w[rc], Ltry_again_ASMAtomicCmpXchgU8_%=\n\t"
1294 "mov %w[fXchg], #1\n\t"
1295 "1:\n\t"
1296 "clrex\n\t"
1297# else
1298 "ldrexb %[uOld], %[pMem]\n\t"
1299 "teq %[uOld], %[uCmp]\n\t"
1300 "strexbeq %[rc], %[uNew], %[pMem]\n\t"
1301 "bne 1f\n\t" /* stop here if not equal */
1302 "cmp %[rc], #0\n\t"
1303 "bne Ltry_again_ASMAtomicCmpXchgU8_%=\n\t"
1304 "mov %[fXchg], #1\n\t"
1305 "1:\n\t"
1306 /** @todo clrexne on armv7? */
1307# endif
1308 : [pMem] "+Q" (*pu8)
1309 , [uOld] "=&r" (u32Spill)
1310 , [rc] "=&r" (rcSpill)
1311 , [fXchg] "=&r" (fXchg.u)
1312 : [uCmp] "r" ((uint32_t)u8Old)
1313 , [uNew] "r" ((uint32_t)u8New)
1314 , "[fXchg]" (0)
1315 RTASM_ARM_DMB_SY_COMMA_IN_REG
1316 : "cc");
1317# endif
1318 return fXchg.f;
1319
1320# else
1321# error "Port me"
1322# endif
1323}
1324#endif
1325
1326
1327/**
1328 * Atomically Compare and Exchange a signed 8-bit value, ordered.
1329 *
1330 * @returns true if xchg was done.
1331 * @returns false if xchg wasn't done.
1332 *
1333 * @param pi8 Pointer to the value to update.
1334 * @param i8New The new value to assigned to *pi8.
1335 * @param i8Old The old value to *pi8 compare with.
1336 *
1337 * @remarks x86: Requires a 486 or later.
1338 * @todo Rename ASMAtomicCmpWriteS8
1339 */
1340DECLINLINE(bool) ASMAtomicCmpXchgS8(volatile int8_t RT_FAR *pi8, const int8_t i8New, const int8_t i8Old) RT_NOTHROW_DEF
1341{
1342 return ASMAtomicCmpXchgU8((volatile uint8_t RT_FAR *)pi8, (uint8_t)i8New, (uint8_t)i8Old);
1343}
1344
1345
1346/**
1347 * Atomically Compare and Exchange a bool value, ordered.
1348 *
1349 * @returns true if xchg was done.
1350 * @returns false if xchg wasn't done.
1351 *
1352 * @param pf Pointer to the value to update.
1353 * @param fNew The new value to assigned to *pf.
1354 * @param fOld The old value to *pf compare with.
1355 *
1356 * @remarks x86: Requires a 486 or later.
1357 * @todo Rename ASMAtomicCmpWriteBool
1358 */
1359DECLINLINE(bool) ASMAtomicCmpXchgBool(volatile bool RT_FAR *pf, const bool fNew, const bool fOld) RT_NOTHROW_DEF
1360{
1361 return ASMAtomicCmpXchgU8((volatile uint8_t RT_FAR *)pf, (uint8_t)fNew, (uint8_t)fOld);
1362}
1363
1364
1365/**
1366 * Atomically Compare and Exchange an unsigned 32-bit value, ordered.
1367 *
1368 * @returns true if xchg was done.
1369 * @returns false if xchg wasn't done.
1370 *
1371 * @param pu32 Pointer to the value to update.
1372 * @param u32New The new value to assigned to *pu32.
1373 * @param u32Old The old value to *pu32 compare with.
1374 *
1375 * @remarks x86: Requires a 486 or later.
1376 * @todo Rename ASMAtomicCmpWriteU32
1377 */
1378#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
1379RT_ASM_DECL_PRAGMA_WATCOM(bool) ASMAtomicCmpXchgU32(volatile uint32_t RT_FAR *pu32, const uint32_t u32New, const uint32_t u32Old) RT_NOTHROW_PROTO;
1380#else
1381DECLINLINE(bool) ASMAtomicCmpXchgU32(volatile uint32_t RT_FAR *pu32, const uint32_t u32New, uint32_t u32Old) RT_NOTHROW_DEF
1382{
1383# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
1384# if RT_INLINE_ASM_GNU_STYLE
1385 uint8_t u8Ret;
1386 __asm__ __volatile__("lock; cmpxchgl %3, %0\n\t"
1387 "setz %1\n\t"
1388 : "=m" (*pu32)
1389 , "=qm" (u8Ret)
1390 , "=a" (u32Old)
1391 : "r" (u32New)
1392 , "2" (u32Old)
1393 , "m" (*pu32)
1394 : "cc");
1395 return (bool)u8Ret;
1396
1397# elif RT_INLINE_ASM_USES_INTRIN
1398 return (uint32_t)_InterlockedCompareExchange((long RT_FAR *)pu32, u32New, u32Old) == u32Old;
1399
1400# else
1401 uint32_t u32Ret;
1402 __asm
1403 {
1404# ifdef RT_ARCH_AMD64
1405 mov rdx, [pu32]
1406# else
1407 mov edx, [pu32]
1408# endif
1409 mov eax, [u32Old]
1410 mov ecx, [u32New]
1411# ifdef RT_ARCH_AMD64
1412 lock cmpxchg [rdx], ecx
1413# else
1414 lock cmpxchg [edx], ecx
1415# endif
1416 setz al
1417 movzx eax, al
1418 mov [u32Ret], eax
1419 }
1420 return !!u32Ret;
1421# endif
1422
1423# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
1424# if RT_INLINE_ASM_USES_INTRIN
1425# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
1426 uint32_t const uOldActual = __casal32(pu32, u32Old, u32New);
1427# else
1428 __dmb(_ARM64_BARRIER_SY);
1429 uint32_t const uOldActual = __cas32(pu32, u32Old, u32New);
1430# endif
1431 return uOldActual == u32Old; /* Lets hope the compiler is clever enough to replicate our cmp + cset optimization below. */
1432
1433# else
1434 union { uint32_t u; bool f; } fXchg;
1435 uint32_t u32Spill;
1436 /* M1 bench: match: casal= 6592 vs dmb+cas= 1562 vs non-lse=5634 (ps/call)
1437 mismatch: casal=18794 vs dmb+cas=19697 vs non-lse=2499 (ps/call) */
1438# if defined(RTASM_ARM64_USE_FEAT_LSE)
1439 __asm__ __volatile__("Lstart_ASMAtomicCmpXchgU32_%=:\n\t"
1440# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
1441 "casal %w[uOldActual], %w[uNew], %[pMem]\n\t"
1442# else
1443 RTASM_ARM_DMB_SY
1444 "cas %w[uOldActual], %w[uNew], %[pMem]\n\t"
1445# endif
1446 "cmp %w[uOldActual], %w[uOldOrg]\n\t"
1447 "cset %w[fXchg], eq\n\t"
1448 : [pMem] "+Q" (*pu32)
1449 , [uOldActual] "=&r" (u32Spill)
1450 , [fXchg] "=&r" (fXchg.u)
1451 : [uNew] "r" (u32New)
1452 , [uOldOrg] "r" (u32Old)
1453 , "[uOldActual]" (u32Old)
1454 : "cc");
1455# else
1456 uint32_t rcSpill;
1457 __asm__ __volatile__("Ltry_again_ASMAtomicCmpXchgU32_%=:\n\t"
1458 RTASM_ARM_DMB_SY
1459# if defined(RT_ARCH_ARM64)
1460 "ldaxr %w[uOld], %[pMem]\n\t"
1461 "cmp %w[uOld], %w[uCmp]\n\t"
1462 "bne 1f\n\t" /* stop here if not equal */
1463 "stlxr %w[rc], %w[uNew], %[pMem]\n\t"
1464 "cbnz %w[rc], Ltry_again_ASMAtomicCmpXchgU32_%=\n\t"
1465 "mov %w[fXchg], #1\n\t"
1466 "1:\n\t"
1467 "clrex\n\t"
1468# else
1469 "ldrex %[uOld], %[pMem]\n\t"
1470 "teq %[uOld], %[uCmp]\n\t"
1471 "strexeq %[rc], %[uNew], %[pMem]\n\t"
1472 "bne 1f\n\t" /* stop here if not equal */
1473 "cmp %[rc], #0\n\t"
1474 "bne Ltry_again_ASMAtomicCmpXchgU32_%=\n\t"
1475 "mov %[fXchg], #1\n\t"
1476 "1:\n\t"
1477 /** @todo clrexne on armv7? */
1478# endif
1479 : [pMem] "+Q" (*pu32)
1480 , [uOld] "=&r" (u32Spill)
1481 , [rc] "=&r" (rcSpill)
1482 , [fXchg] "=&r" (fXchg.u)
1483 : [uCmp] "r" (u32Old)
1484 , [uNew] "r" (u32New)
1485 , "[fXchg]" (0)
1486 RTASM_ARM_DMB_SY_COMMA_IN_REG
1487 : "cc");
1488# endif
1489 return fXchg.f;
1490# endif
1491
1492# else
1493# error "Port me"
1494# endif
1495}
1496#endif
1497
1498
1499/**
1500 * Atomically Compare and Exchange a signed 32-bit value, ordered.
1501 *
1502 * @returns true if xchg was done.
1503 * @returns false if xchg wasn't done.
1504 *
1505 * @param pi32 Pointer to the value to update.
1506 * @param i32New The new value to assigned to *pi32.
1507 * @param i32Old The old value to *pi32 compare with.
1508 *
1509 * @remarks x86: Requires a 486 or later.
1510 * @todo Rename ASMAtomicCmpWriteS32
1511 */
1512DECLINLINE(bool) ASMAtomicCmpXchgS32(volatile int32_t RT_FAR *pi32, const int32_t i32New, const int32_t i32Old) RT_NOTHROW_DEF
1513{
1514 return ASMAtomicCmpXchgU32((volatile uint32_t RT_FAR *)pi32, (uint32_t)i32New, (uint32_t)i32Old);
1515}
1516
1517
1518/**
1519 * Atomically Compare and exchange an unsigned 64-bit value, ordered.
1520 *
1521 * @returns true if xchg was done.
1522 * @returns false if xchg wasn't done.
1523 *
1524 * @param pu64 Pointer to the 64-bit variable to update.
1525 * @param u64New The 64-bit value to assign to *pu64.
1526 * @param u64Old The value to compare with.
1527 *
1528 * @remarks x86: Requires a Pentium or later.
1529 * @todo Rename ASMAtomicCmpWriteU64
1530 */
1531#if (RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN) \
1532 || RT_INLINE_DONT_MIX_CMPXCHG8B_AND_PIC
1533RT_ASM_DECL_PRAGMA_WATCOM(bool) ASMAtomicCmpXchgU64(volatile uint64_t RT_FAR *pu64, const uint64_t u64New, const uint64_t u64Old) RT_NOTHROW_PROTO;
1534#else
1535DECLINLINE(bool) ASMAtomicCmpXchgU64(volatile uint64_t RT_FAR *pu64, uint64_t u64New, uint64_t u64Old) RT_NOTHROW_DEF
1536{
1537# if RT_INLINE_ASM_USES_INTRIN
1538 return (uint64_t)_InterlockedCompareExchange64((__int64 RT_FAR *)pu64, u64New, u64Old) == u64Old;
1539
1540# elif defined(RT_ARCH_AMD64)
1541# if RT_INLINE_ASM_GNU_STYLE
1542 uint8_t u8Ret;
1543 __asm__ __volatile__("lock; cmpxchgq %3, %0\n\t"
1544 "setz %1\n\t"
1545 : "=m" (*pu64)
1546 , "=qm" (u8Ret)
1547 , "=a" (u64Old)
1548 : "r" (u64New)
1549 , "2" (u64Old)
1550 , "m" (*pu64)
1551 : "cc");
1552 return (bool)u8Ret;
1553# else
1554 bool fRet;
1555 __asm
1556 {
1557 mov rdx, [pu32]
1558 mov rax, [u64Old]
1559 mov rcx, [u64New]
1560 lock cmpxchg [rdx], rcx
1561 setz al
1562 mov [fRet], al
1563 }
1564 return fRet;
1565# endif
1566
1567# elif defined(RT_ARCH_X86)
1568 uint32_t u32Ret;
1569# if RT_INLINE_ASM_GNU_STYLE
1570# if defined(PIC) || defined(__PIC__)
1571 uint32_t u32EBX = (uint32_t)u64New;
1572 uint32_t u32Spill;
1573 __asm__ __volatile__("xchgl %%ebx, %4\n\t"
1574 "lock; cmpxchg8b (%6)\n\t"
1575 "setz %%al\n\t"
1576 "movl %4, %%ebx\n\t"
1577 "movzbl %%al, %%eax\n\t"
1578 : "=a" (u32Ret)
1579 , "=d" (u32Spill)
1580# if RT_GNUC_PREREQ(4, 3)
1581 , "+m" (*pu64)
1582# else
1583 , "=m" (*pu64)
1584# endif
1585 : "A" (u64Old)
1586 , "m" ( u32EBX )
1587 , "c" ( (uint32_t)(u64New >> 32) )
1588 , "S" (pu64)
1589 : "cc");
1590# else /* !PIC */
1591 uint32_t u32Spill;
1592 __asm__ __volatile__("lock; cmpxchg8b %2\n\t"
1593 "setz %%al\n\t"
1594 "movzbl %%al, %%eax\n\t"
1595 : "=a" (u32Ret)
1596 , "=d" (u32Spill)
1597 , "+m" (*pu64)
1598 : "A" (u64Old)
1599 , "b" ( (uint32_t)u64New )
1600 , "c" ( (uint32_t)(u64New >> 32) )
1601 : "cc");
1602# endif
1603 return (bool)u32Ret;
1604# else
1605 __asm
1606 {
1607 mov ebx, dword ptr [u64New]
1608 mov ecx, dword ptr [u64New + 4]
1609 mov edi, [pu64]
1610 mov eax, dword ptr [u64Old]
1611 mov edx, dword ptr [u64Old + 4]
1612 lock cmpxchg8b [edi]
1613 setz al
1614 movzx eax, al
1615 mov dword ptr [u32Ret], eax
1616 }
1617 return !!u32Ret;
1618# endif
1619
1620# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
1621# if RT_INLINE_ASM_USES_INTRIN
1622# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
1623 uint64_t const uOldActual = __casal64(pu64, u64Old, u64New);
1624# else
1625 __dmb(_ARM64_BARRIER_SY);
1626 uint64_t const uOldActual = __cas64(pu64, u64Old, u64New);
1627# endif
1628 return uOldActual == u64Old; /* Lets hope the compiler is clever enough to replicate our cmp + cset optimization below. */
1629
1630# else
1631 union { uint32_t u; bool f; } fXchg;
1632 uint64_t u64Spill;
1633 /* M1 bench: match: casal= 6599 vs dmb+cas= 1565 vs non-lse=5000 (ps/call)
1634 mismatch: casal=18797 vs dmb+cas=19731 vs non-lse=2512 (ps/call) */
1635# if defined(RTASM_ARM64_USE_FEAT_LSE)
1636 __asm__ __volatile__("Lstart_ASMAtomicCmpXchgU75_%=:\n\t"
1637# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
1638 "casal %[uOldActual], %[uNew], %[pMem]\n\t"
1639# else
1640 RTASM_ARM_DMB_SY
1641 "cas %[uOldActual], %[uNew], %[pMem]\n\t"
1642# endif
1643 "cmp %[uOldActual], %[uOldOrg]\n\t"
1644 "cset %w[fXchg], eq\n\t"
1645 : [pMem] "+Q" (*pu64)
1646 , [uOldActual] "=&r" (u64Spill)
1647 , [fXchg] "=&r" (fXchg.u)
1648 : [uNew] "r" (u64New)
1649 , [uOldOrg] "r" (u64Old)
1650 , "[uOldActual]" (u64Old)
1651 : "cc");
1652# else
1653 uint32_t rcSpill;
1654 __asm__ __volatile__("Ltry_again_ASMAtomicCmpXchgU64_%=:\n\t"
1655 RTASM_ARM_DMB_SY
1656# if defined(RT_ARCH_ARM64)
1657 "ldaxr %[uOld], %[pMem]\n\t"
1658 "cmp %[uOld], %[uCmp]\n\t"
1659 "bne 1f\n\t" /* stop here if not equal */
1660 "stlxr %w[rc], %[uNew], %[pMem]\n\t"
1661 "cbnz %w[rc], Ltry_again_ASMAtomicCmpXchgU64_%=\n\t"
1662 "mov %w[fXchg], #1\n\t"
1663 "1:\n\t"
1664 "clrex\n\t"
1665# else
1666 "ldrexd %[uOld], %H[uOld], %[pMem]\n\t"
1667 "teq %[uOld], %[uCmp]\n\t"
1668 "teqeq %H[uOld], %H[uCmp]\n\t"
1669 "strexdeq %[rc], %[uNew], %H[uNew], %[pMem]\n\t"
1670 "bne 1f\n\t" /* stop here if not equal */
1671 "cmp %[rc], #0\n\t"
1672 "bne Ltry_again_ASMAtomicCmpXchgU64_%=\n\t"
1673 "mov %[fXchg], #1\n\t"
1674 "1:\n\t"
1675 /** @todo clrexne on armv7? */
1676# endif
1677 : [pMem] "+Q" (*pu64)
1678 , [uOld] "=&r" (u64Spill)
1679 , [rc] "=&r" (rcSpill)
1680 , [fXchg] "=&r" (fXchg.u)
1681 : [uCmp] "r" (u64Old)
1682 , [uNew] "r" (u64New)
1683 , "[fXchg]" (0)
1684 RTASM_ARM_DMB_SY_COMMA_IN_REG
1685 : "cc");
1686# endif
1687 return fXchg.f;
1688# endif
1689
1690# else
1691# error "Port me"
1692# endif
1693}
1694#endif
1695
1696
1697/**
1698 * Atomically Compare and exchange a signed 64-bit value, ordered.
1699 *
1700 * @returns true if xchg was done.
1701 * @returns false if xchg wasn't done.
1702 *
1703 * @param pi64 Pointer to the 64-bit variable to update.
1704 * @param i64 The 64-bit value to assign to *pu64.
1705 * @param i64Old The value to compare with.
1706 *
1707 * @remarks x86: Requires a Pentium or later.
1708 * @todo Rename ASMAtomicCmpWriteS64
1709 */
1710DECLINLINE(bool) ASMAtomicCmpXchgS64(volatile int64_t RT_FAR *pi64, const int64_t i64, const int64_t i64Old) RT_NOTHROW_DEF
1711{
1712 return ASMAtomicCmpXchgU64((volatile uint64_t RT_FAR *)pi64, (uint64_t)i64, (uint64_t)i64Old);
1713}
1714
1715#if defined(RT_ARCH_AMD64) || defined(RT_ARCH_ARM64) || defined(DOXYGEN_RUNNING)
1716
1717/** @def RTASM_HAVE_CMP_WRITE_U128
1718 * Indicates that we've got ASMAtomicCmpWriteU128(), ASMAtomicCmpWriteU128v2()
1719 * and ASMAtomicCmpWriteExU128() available. */
1720# define RTASM_HAVE_CMP_WRITE_U128 1
1721
1722
1723/**
1724 * Atomically compare and write an unsigned 128-bit value, ordered.
1725 *
1726 * @returns true if write was done.
1727 * @returns false if write wasn't done.
1728 *
1729 * @param pu128 Pointer to the 128-bit variable to update.
1730 * @param u64NewHi The high 64 bits of the value to assign to *pu128.
1731 * @param u64NewLo The low 64 bits of the value to assign to *pu128.
1732 * @param u64OldHi The high 64-bit of the value to compare with.
1733 * @param u64OldLo The low 64-bit of the value to compare with.
1734 *
1735 * @remarks AMD64: Not present in the earliest CPUs, so check CPUID.
1736 */
1737# if (RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN)
1738DECLASM(bool) ASMAtomicCmpWriteU128v2(volatile uint128_t *pu128, const uint64_t u64NewHi, const uint64_t u64NewLo,
1739 const uint64_t u64OldHi, const uint64_t u64OldLo) RT_NOTHROW_PROTO;
1740# else
1741DECLINLINE(bool) ASMAtomicCmpWriteU128v2(volatile uint128_t *pu128, const uint64_t u64NewHi, const uint64_t u64NewLo,
1742 const uint64_t u64OldHi, const uint64_t u64OldLo) RT_NOTHROW_DEF
1743{
1744# if RT_INLINE_ASM_USES_INTRIN
1745 __int64 ai64Cmp[2];
1746 ai64Cmp[0] = u64OldLo;
1747 ai64Cmp[1] = u64OldHi;
1748 return _InterlockedCompareExchange128((__int64 volatile *)pu128, u64NewHi, u64NewLo, ai64Cmp) != 0;
1749
1750# elif (defined(__clang_major__) || defined(__GNUC__)) && defined(RT_ARCH_ARM64)
1751 return __sync_bool_compare_and_swap(pu128, ((uint128_t)u64OldHi << 64) | u64OldLo, ((uint128_t)u64NewHi << 64) | u64NewLo);
1752
1753# elif defined(RT_ARCH_AMD64)
1754# if RT_INLINE_ASM_GNU_STYLE
1755 uint64_t u64Ret;
1756 uint64_t u64Spill;
1757 __asm__ __volatile__("lock; cmpxchg16b %2\n\t"
1758 "setz %%al\n\t"
1759 "movzbl %%al, %%eax\n\t"
1760 : "=a" (u64Ret)
1761 , "=d" (u64Spill)
1762 , "+m" (*pu128)
1763 : "a" (u64OldLo)
1764 , "d" (u64OldHi)
1765 , "b" (u64NewLo)
1766 , "c" (u64NewHi)
1767 : "cc");
1768
1769 return (bool)u64Ret;
1770# else
1771# error "Port me"
1772# endif
1773# else
1774# error "Port me"
1775# endif
1776}
1777# endif
1778
1779
1780/**
1781 * Atomically compare and write an unsigned 128-bit value, ordered.
1782 *
1783 * @returns true if write was done.
1784 * @returns false if write wasn't done.
1785 *
1786 * @param pu128 Pointer to the 128-bit variable to update.
1787 * @param u128New The 128-bit value to assign to *pu128.
1788 * @param u128Old The value to compare with.
1789 *
1790 * @remarks AMD64: Not present in the earliest CPUs, so check CPUID.
1791 */
1792DECLINLINE(bool) ASMAtomicCmpWriteU128(volatile uint128_t *pu128, const uint128_t u128New, const uint128_t u128Old) RT_NOTHROW_DEF
1793{
1794# ifdef RT_COMPILER_WITH_128BIT_INT_TYPES
1795# if (defined(__clang_major__) || defined(__GNUC__)) && defined(RT_ARCH_ARM64)
1796 return __sync_bool_compare_and_swap(pu128, u128Old, u128New);
1797# else
1798 return ASMAtomicCmpWriteU128v2(pu128, (uint64_t)(u128New >> 64), (uint64_t)u128New,
1799 (uint64_t)(u128Old >> 64), (uint64_t)u128Old);
1800# endif
1801# else
1802 return ASMAtomicCmpWriteU128v2(pu128, u128New.Hi, u128New.Lo, u128Old.Hi, u128Old.Lo);
1803# endif
1804}
1805
1806
1807/**
1808 * RTUINT128U wrapper for ASMAtomicCmpWriteU128.
1809 */
1810DECLINLINE(bool) ASMAtomicCmpWriteU128U(volatile RTUINT128U *pu128, const RTUINT128U u128New,
1811 const RTUINT128U u128Old) RT_NOTHROW_DEF
1812{
1813# if (defined(__clang_major__) || defined(__GNUC__)) && defined(RT_ARCH_ARM64)
1814 return ASMAtomicCmpWriteU128(&pu128->u, u128New.u, u128Old.u);
1815# else
1816 return ASMAtomicCmpWriteU128v2(&pu128->u, u128New.s.Hi, u128New.s.Lo, u128Old.s.Hi, u128Old.s.Lo);
1817# endif
1818}
1819
1820#endif /* RT_ARCH_AMD64 || RT_ARCH_ARM64 */
1821
1822/**
1823 * Atomically Compare and Exchange a pointer value, ordered.
1824 *
1825 * @returns true if xchg was done.
1826 * @returns false if xchg wasn't done.
1827 *
1828 * @param ppv Pointer to the value to update.
1829 * @param pvNew The new value to assigned to *ppv.
1830 * @param pvOld The old value to *ppv compare with.
1831 *
1832 * @remarks x86: Requires a 486 or later.
1833 * @todo Rename ASMAtomicCmpWritePtrVoid
1834 */
1835DECLINLINE(bool) ASMAtomicCmpXchgPtrVoid(void RT_FAR * volatile RT_FAR *ppv, const void RT_FAR *pvNew, const void RT_FAR *pvOld) RT_NOTHROW_DEF
1836{
1837#if ARCH_BITS == 32 || ARCH_BITS == 16
1838 return ASMAtomicCmpXchgU32((volatile uint32_t RT_FAR *)(void RT_FAR *)ppv, (uint32_t)pvNew, (uint32_t)pvOld);
1839#elif ARCH_BITS == 64
1840 return ASMAtomicCmpXchgU64((volatile uint64_t RT_FAR *)(void RT_FAR *)ppv, (uint64_t)pvNew, (uint64_t)pvOld);
1841#else
1842# error "ARCH_BITS is bogus"
1843#endif
1844}
1845
1846
1847/**
1848 * Atomically Compare and Exchange a pointer value, ordered.
1849 *
1850 * @returns true if xchg was done.
1851 * @returns false if xchg wasn't done.
1852 *
1853 * @param ppv Pointer to the value to update.
1854 * @param pvNew The new value to assigned to *ppv.
1855 * @param pvOld The old value to *ppv compare with.
1856 *
1857 * @remarks This is relatively type safe on GCC platforms.
1858 * @remarks x86: Requires a 486 or later.
1859 * @todo Rename ASMAtomicCmpWritePtr
1860 */
1861#ifdef __GNUC__
1862# define ASMAtomicCmpXchgPtr(ppv, pvNew, pvOld) \
1863 __extension__ \
1864 ({\
1865 __typeof__(*(ppv)) volatile * const ppvTypeChecked = (ppv); \
1866 __typeof__(*(ppv)) const pvNewTypeChecked = (pvNew); \
1867 __typeof__(*(ppv)) const pvOldTypeChecked = (pvOld); \
1868 bool fMacroRet = ASMAtomicCmpXchgPtrVoid((void * volatile *)ppvTypeChecked, \
1869 (void *)pvNewTypeChecked, (void *)pvOldTypeChecked); \
1870 fMacroRet; \
1871 })
1872#else
1873# define ASMAtomicCmpXchgPtr(ppv, pvNew, pvOld) \
1874 ASMAtomicCmpXchgPtrVoid((void RT_FAR * volatile RT_FAR *)(ppv), (void RT_FAR *)(pvNew), (void RT_FAR *)(pvOld))
1875#endif
1876
1877
1878/** @def ASMAtomicCmpXchgHandle
1879 * Atomically Compare and Exchange a typical IPRT handle value, ordered.
1880 *
1881 * @param ph Pointer to the value to update.
1882 * @param hNew The new value to assigned to *pu.
1883 * @param hOld The old value to *pu compare with.
1884 * @param fRc Where to store the result.
1885 *
1886 * @remarks This doesn't currently work for all handles (like RTFILE).
1887 * @remarks x86: Requires a 486 or later.
1888 * @todo Rename ASMAtomicCmpWriteHandle
1889 */
1890#if HC_ARCH_BITS == 32 || ARCH_BITS == 16
1891# define ASMAtomicCmpXchgHandle(ph, hNew, hOld, fRc) \
1892 do { \
1893 AssertCompile(sizeof(*(ph)) == sizeof(uint32_t)); \
1894 (fRc) = ASMAtomicCmpXchgU32((uint32_t volatile RT_FAR *)(ph), (const uint32_t)(hNew), (const uint32_t)(hOld)); \
1895 } while (0)
1896#elif HC_ARCH_BITS == 64
1897# define ASMAtomicCmpXchgHandle(ph, hNew, hOld, fRc) \
1898 do { \
1899 AssertCompile(sizeof(*(ph)) == sizeof(uint64_t)); \
1900 (fRc) = ASMAtomicCmpXchgU64((uint64_t volatile RT_FAR *)(ph), (const uint64_t)(hNew), (const uint64_t)(hOld)); \
1901 } while (0)
1902#else
1903# error HC_ARCH_BITS
1904#endif
1905
1906
1907/** @def ASMAtomicCmpXchgSize
1908 * Atomically Compare and Exchange a value which size might differ
1909 * between platforms or compilers, ordered.
1910 *
1911 * @param pu Pointer to the value to update.
1912 * @param uNew The new value to assigned to *pu.
1913 * @param uOld The old value to *pu compare with.
1914 * @param fRc Where to store the result.
1915 *
1916 * @remarks x86: Requires a 486 or later.
1917 * @todo Rename ASMAtomicCmpWriteSize
1918 */
1919#define ASMAtomicCmpXchgSize(pu, uNew, uOld, fRc) \
1920 do { \
1921 switch (sizeof(*(pu))) { \
1922 case 4: (fRc) = ASMAtomicCmpXchgU32((volatile uint32_t RT_FAR *)(void RT_FAR *)(pu), (uint32_t)(uNew), (uint32_t)(uOld)); \
1923 break; \
1924 case 8: (fRc) = ASMAtomicCmpXchgU64((volatile uint64_t RT_FAR *)(void RT_FAR *)(pu), (uint64_t)(uNew), (uint64_t)(uOld)); \
1925 break; \
1926 default: AssertMsgFailed(("ASMAtomicCmpXchgSize: size %d is not supported\n", sizeof(*(pu)))); \
1927 (fRc) = false; \
1928 break; \
1929 } \
1930 } while (0)
1931
1932
1933/**
1934 * Atomically Compare and Exchange an unsigned 8-bit value, additionally passes
1935 * back old value, ordered.
1936 *
1937 * @returns true if xchg was done.
1938 * @returns false if xchg wasn't done.
1939 *
1940 * @param pu8 Pointer to the value to update.
1941 * @param u8New The new value to assigned to *pu32.
1942 * @param u8Old The old value to *pu8 compare with.
1943 * @param pu8Old Pointer store the old value at.
1944 *
1945 * @remarks x86: Requires a 486 or later.
1946 */
1947#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
1948RT_ASM_DECL_PRAGMA_WATCOM(bool) ASMAtomicCmpXchgExU8(volatile uint8_t RT_FAR *pu8, const uint8_t u8New, const uint8_t u8Old, uint8_t RT_FAR *pu8Old) RT_NOTHROW_PROTO;
1949#else
1950DECLINLINE(bool) ASMAtomicCmpXchgExU8(volatile uint8_t RT_FAR *pu8, const uint8_t u8New, const uint8_t u8Old, uint8_t RT_FAR *pu8Old) RT_NOTHROW_DEF
1951{
1952# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
1953# if RT_INLINE_ASM_GNU_STYLE
1954 uint8_t u8Ret;
1955 __asm__ __volatile__("lock; cmpxchgb %3, %0\n\t"
1956 "setz %1\n\t"
1957 : "=m" (*pu8)
1958 , "=qm" (u8Ret)
1959 , "=a" (*pu8Old)
1960# if defined(RT_ARCH_X86)
1961 : "q" (u8New)
1962# else
1963 : "r" (u8New)
1964# endif
1965 , "a" (u8Old)
1966 , "m" (*pu8)
1967 : "cc");
1968 return (bool)u8Ret;
1969
1970# elif RT_INLINE_ASM_USES_INTRIN
1971 return (*pu8Old = _InterlockedCompareExchange8((char RT_FAR *)pu8, u8New, u8Old)) == u8Old;
1972
1973# else
1974 uint8_t u8Ret;
1975 __asm
1976 {
1977# ifdef RT_ARCH_AMD64
1978 mov rdx, [pu8]
1979# else
1980 mov edx, [pu8]
1981# endif
1982 mov eax, [u8Old]
1983 mov ecx, [u8New]
1984# ifdef RT_ARCH_AMD64
1985 lock cmpxchg [rdx], ecx
1986 mov rdx, [pu8Old]
1987 mov [rdx], eax
1988# else
1989 lock cmpxchg [edx], ecx
1990 mov edx, [pu8Old]
1991 mov [edx], eax
1992# endif
1993 setz al
1994 movzx eax, al
1995 mov [u8Ret], eax
1996 }
1997 return !!u8Ret;
1998# endif
1999
2000# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
2001# if RT_INLINE_ASM_USES_INTRIN
2002# if defined(RTASM_ARM64_USE_FEAT_LSE)
2003# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
2004 uint8_t const uOldActual = __casal8(pu8, u8Old, u8New);
2005# else
2006 __dmb(_ARM64_BARRIER_SY);
2007 uint8_t const uOldActual = __cas8(pu8, u8Old, u8New);
2008# endif
2009# else
2010 uint8_t const uOldActual = _InterlockedCompareExchange8((char RT_FAR *)pu8, u8New, u8Old);
2011# endif
2012 *pu8Old = uOldActual;
2013 return uOldActual == u8Old; /* Lets hope the compiler is clever enough to replicate our cmp + cset optimization below. */
2014
2015# else
2016 /* M1 bench: match: casalb= 6594 vs dmb+casb= 1561 vs non-lse=5051 (ps/call)
2017 mismatch: casalb=15346 vs dmb+casb=16349 vs non-lse=2505 (ps/call) */
2018# if defined(RTASM_ARM64_USE_FEAT_LSE)
2019 union { uint32_t u; bool f; } fXchg;
2020 uint32_t u32Actual;
2021 __asm__ __volatile__("Lstart_ASMAtomicCmpXchgExU8_%=:\n\t"
2022# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
2023 "casalb %w[uOldActual], %w[uNew], %[pMem]\n\t"
2024# else
2025 RTASM_ARM_DMB_SY
2026 "casb %w[uOldActual], %w[uNew], %[pMem]\n\t"
2027# endif
2028 "cmp %w[uOldActual], %w[uOldOrg]\n\t"
2029 "cset %w[fXchg], eq\n\t"
2030 : [pMem] "+Q" (*pu8)
2031 , [uOldActual] "=&r" (u32Actual)
2032 , [fXchg] "=&r" (fXchg.u)
2033 : [uNew] "r" ((uint32_t)u8New)
2034 , [uOldOrg] "r" ((uint32_t)u8Old)
2035 , "[uOldActual]" ((uint32_t)u8Old)
2036 : "cc");
2037 *pu8Old = (uint8_t)u32Actual;
2038# else
2039 union { uint8_t u; bool f; } fXchg;
2040 uint8_t u8ActualOld;
2041 uint8_t rcSpill;
2042 __asm__ __volatile__("Ltry_again_ASMAtomicCmpXchgExU8_%=:\n\t"
2043 RTASM_ARM_DMB_SY
2044# if defined(RT_ARCH_ARM64)
2045 "ldaxrb %w[uOld], %[pMem]\n\t"
2046 "cmp %w[uOld], %w[uCmp]\n\t"
2047 "bne 1f\n\t" /* stop here if not equal */
2048 "stlxrb %w[rc], %w[uNew], %[pMem]\n\t"
2049 "cbnz %w[rc], Ltry_again_ASMAtomicCmpXchgExU8_%=\n\t"
2050 "mov %w[fXchg], #1\n\t"
2051 "1:\n\t"
2052 "clrex\n\t"
2053# else
2054 "ldrexb %[uOld], %[pMem]\n\t"
2055 "teq %[uOld], %[uCmp]\n\t"
2056 "strexbeq %[rc], %[uNew], %[pMem]\n\t"
2057 "bne 1f\n\t" /* stop here if not equal */
2058 "cmp %[rc], #0\n\t"
2059 "bne Ltry_again_ASMAtomicCmpXchgExU8_%=\n\t"
2060 "mov %[fXchg], #1\n\t"
2061 "1:\n\t"
2062 /** @todo clrexne on armv7? */
2063# endif
2064 : [pMem] "+Q" (*pu8)
2065 , [uOld] "=&r" (u8ActualOld)
2066 , [rc] "=&r" (rcSpill)
2067 , [fXchg] "=&r" (fXchg.u)
2068 : [uCmp] "r" (u8Old)
2069 , [uNew] "r" (u8New)
2070 , "[fXchg]" (0)
2071 RTASM_ARM_DMB_SY_COMMA_IN_REG
2072 : "cc");
2073 *pu8Old = u8ActualOld;
2074# endif
2075 return fXchg.f;
2076# endif
2077
2078# else
2079# error "Port me"
2080# endif
2081}
2082#endif
2083
2084
2085/**
2086 * Atomically Compare and Exchange a signed 8-bit value, additionally
2087 * passes back old value, ordered.
2088 *
2089 * @returns true if xchg was done.
2090 * @returns false if xchg wasn't done.
2091 *
2092 * @param pi8 Pointer to the value to update.
2093 * @param i8New The new value to assigned to *pi8.
2094 * @param i8Old The old value to *pi8 compare with.
2095 * @param pi8Old Pointer store the old value at.
2096 *
2097 * @remarks x86: Requires a 486 or later.
2098 */
2099DECLINLINE(bool) ASMAtomicCmpXchgExS8(volatile int8_t RT_FAR *pi8, const int8_t i8New, const int8_t i8Old, int8_t RT_FAR *pi8Old) RT_NOTHROW_DEF
2100{
2101 return ASMAtomicCmpXchgExU8((volatile uint8_t RT_FAR *)pi8, (uint8_t)i8New, (uint8_t)i8Old, (uint8_t RT_FAR *)pi8Old);
2102}
2103
2104
2105/**
2106 * Atomically Compare and Exchange an unsigned 16-bit value, additionally passes
2107 * back old value, ordered.
2108 *
2109 * @returns true if xchg was done.
2110 * @returns false if xchg wasn't done.
2111 *
2112 * @param pu16 Pointer to the value to update.
2113 * @param u16New The new value to assigned to *pu16.
2114 * @param u16Old The old value to *pu32 compare with.
2115 * @param pu16Old Pointer store the old value at.
2116 *
2117 * @remarks x86: Requires a 486 or later.
2118 */
2119#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
2120RT_ASM_DECL_PRAGMA_WATCOM(bool) ASMAtomicCmpXchgExU16(volatile uint16_t RT_FAR *pu16, const uint16_t u16New, const uint16_t u16Old, uint16_t RT_FAR *pu16Old) RT_NOTHROW_PROTO;
2121#else
2122DECLINLINE(bool) ASMAtomicCmpXchgExU16(volatile uint16_t RT_FAR *pu16, const uint16_t u16New, const uint16_t u16Old, uint16_t RT_FAR *pu16Old) RT_NOTHROW_DEF
2123{
2124# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
2125# if RT_INLINE_ASM_GNU_STYLE
2126 uint8_t u8Ret;
2127 __asm__ __volatile__("lock; cmpxchgw %3, %0\n\t"
2128 "setz %1\n\t"
2129 : "=m" (*pu16)
2130 , "=qm" (u8Ret)
2131 , "=a" (*pu16Old)
2132 : "r" (u16New)
2133 , "a" (u16Old)
2134 , "m" (*pu16)
2135 : "cc");
2136 return (bool)u8Ret;
2137
2138# elif RT_INLINE_ASM_USES_INTRIN
2139 return (*pu16Old = _InterlockedCompareExchange16((short RT_FAR *)pu16, u16New, u16Old)) == u16Old;
2140
2141# else
2142 uint16_t u16Ret;
2143 __asm
2144 {
2145# ifdef RT_ARCH_AMD64
2146 mov rdx, [pu16]
2147# else
2148 mov edx, [pu16]
2149# endif
2150 mov eax, [u16Old]
2151 mov ecx, [u16New]
2152# ifdef RT_ARCH_AMD64
2153 lock cmpxchg [rdx], ecx
2154 mov rdx, [pu16Old]
2155 mov [rdx], eax
2156# else
2157 lock cmpxchg [edx], ecx
2158 mov edx, [pu16Old]
2159 mov [edx], eax
2160# endif
2161 setz al
2162 movzx eax, al
2163 mov [u16Ret], eax
2164 }
2165 return !!u16Ret;
2166# endif
2167
2168# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
2169# if RT_INLINE_ASM_USES_INTRIN
2170# if defined(RTASM_ARM64_USE_FEAT_LSE)
2171# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
2172 uint16_t const uOldActual = __casal16(pu16, u16Old, u16New);
2173# else
2174 __dmb(_ARM64_BARRIER_SY);
2175 uint16_t const uOldActual = __cas16(pu16, u16Old, u16New);
2176# endif
2177# else
2178 uint16_t const uOldActual = _InterlockedCompareExchange16((char RT_FAR *)pu16, u16New, u16Old);
2179# endif
2180 *pu16Old = uOldActual;
2181 return uOldActual == u16Old; /* Lets hope the compiler is clever enough to replicate our cmp + cset optimization below. */
2182
2183# else
2184 /* M1 bench: match: casalh= 6577 vs dmb+cash= 1608 vs non-lse=5078 (ps/call)
2185 mismatch: casalh=18791 vs dmb+cash=19721 vs non-lse=2543 (ps/call) */
2186# if defined(RTASM_ARM64_USE_FEAT_LSE)
2187 union { uint32_t u; bool f; } fXchg;
2188 uint32_t u32Actual;
2189 __asm__ __volatile__("Lstart_ASMAtomicCmpXchgExU16_%=:\n\t"
2190# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
2191 "casalh %w[uOldActual], %w[uNew], %[pMem]\n\t"
2192# else
2193 RTASM_ARM_DMB_SY
2194 "cash %w[uOldActual], %w[uNew], %[pMem]\n\t"
2195# endif
2196 "cmp %w[uOldActual], %w[uOldOrg]\n\t"
2197 "cset %w[fXchg], eq\n\t"
2198 : [pMem] "+Q" (*pu16)
2199 , [uOldActual] "=&r" (u32Actual)
2200 , [fXchg] "=&r" (fXchg.u)
2201 : [uNew] "r" ((uint32_t)u16New)
2202 , [uOldOrg] "r" ((uint32_t)u16Old)
2203 , "[uOldActual]" ((uint32_t)u16Old)
2204 : "cc");
2205 *pu16Old = (uint16_t)u32Actual;
2206# else
2207 union { uint16_t u; bool f; } fXchg;
2208 uint16_t u16ActualOld;
2209 uint16_t rcSpill;
2210 __asm__ __volatile__("Ltry_again_ASMAtomicCmpXchgExU16_%=:\n\t"
2211 RTASM_ARM_DMB_SY
2212# if defined(RT_ARCH_ARM64)
2213 "ldaxrh %w[uOld], %[pMem]\n\t"
2214 "cmp %w[uOld], %w[uCmp]\n\t"
2215 "bne 1f\n\t" /* stop here if not equal */
2216 "stlxrh %w[rc], %w[uNew], %[pMem]\n\t"
2217 "cbnz %w[rc], Ltry_again_ASMAtomicCmpXchgExU16_%=\n\t"
2218 "mov %w[fXchg], #1\n\t"
2219 "1:\n\t"
2220 "clrex\n\t"
2221# else
2222 "ldrexh %[uOld], %[pMem]\n\t"
2223 "teq %[uOld], %[uCmp]\n\t"
2224 "strexheq %[rc], %[uNew], %[pMem]\n\t"
2225 "bne 1f\n\t" /* stop here if not equal */
2226 "cmp %[rc], #0\n\t"
2227 "bne Ltry_again_ASMAtomicCmpXchgExU16_%=\n\t"
2228 "mov %[fXchg], #1\n\t"
2229 "1:\n\t"
2230 /** @todo clrexne on armv7? */
2231# endif
2232 : [pMem] "+Q" (*pu16)
2233 , [uOld] "=&r" (u16ActualOld)
2234 , [rc] "=&r" (rcSpill)
2235 , [fXchg] "=&r" (fXchg.u)
2236 : [uCmp] "r" (u16Old)
2237 , [uNew] "r" (u16New)
2238 , "[fXchg]" (0)
2239 RTASM_ARM_DMB_SY_COMMA_IN_REG
2240 : "cc");
2241 *pu16Old = u16ActualOld;
2242# endif
2243 return fXchg.f;
2244# endif
2245
2246# else
2247# error "Port me"
2248# endif
2249}
2250#endif
2251
2252
2253/**
2254 * Atomically Compare and Exchange a signed 16-bit value, additionally
2255 * passes back old value, ordered.
2256 *
2257 * @returns true if xchg was done.
2258 * @returns false if xchg wasn't done.
2259 *
2260 * @param pi16 Pointer to the value to update.
2261 * @param i16New The new value to assigned to *pi16.
2262 * @param i16Old The old value to *pi16 compare with.
2263 * @param pi16Old Pointer store the old value at.
2264 *
2265 * @remarks x86: Requires a 486 or later.
2266 */
2267DECLINLINE(bool) ASMAtomicCmpXchgExS16(volatile int16_t RT_FAR *pi16, const int16_t i16New, const int16_t i16Old, int16_t RT_FAR *pi16Old) RT_NOTHROW_DEF
2268{
2269 return ASMAtomicCmpXchgExU16((volatile uint16_t RT_FAR *)pi16, (uint16_t)i16New, (uint16_t)i16Old, (uint16_t RT_FAR *)pi16Old);
2270}
2271
2272
2273/**
2274 * Atomically Compare and Exchange an unsigned 32-bit value, additionally
2275 * passes back old value, ordered.
2276 *
2277 * @returns true if xchg was done.
2278 * @returns false if xchg wasn't done.
2279 *
2280 * @param pu32 Pointer to the value to update.
2281 * @param u32New The new value to assigned to *pu32.
2282 * @param u32Old The old value to *pu32 compare with.
2283 * @param pu32Old Pointer store the old value at.
2284 *
2285 * @remarks x86: Requires a 486 or later.
2286 */
2287#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
2288RT_ASM_DECL_PRAGMA_WATCOM(bool) ASMAtomicCmpXchgExU32(volatile uint32_t RT_FAR *pu32, const uint32_t u32New, const uint32_t u32Old, uint32_t RT_FAR *pu32Old) RT_NOTHROW_PROTO;
2289#else
2290DECLINLINE(bool) ASMAtomicCmpXchgExU32(volatile uint32_t RT_FAR *pu32, const uint32_t u32New, const uint32_t u32Old, uint32_t RT_FAR *pu32Old) RT_NOTHROW_DEF
2291{
2292# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
2293# if RT_INLINE_ASM_GNU_STYLE
2294 uint8_t u8Ret;
2295 __asm__ __volatile__("lock; cmpxchgl %3, %0\n\t"
2296 "setz %1\n\t"
2297 : "=m" (*pu32)
2298 , "=qm" (u8Ret)
2299 , "=a" (*pu32Old)
2300 : "r" (u32New)
2301 , "a" (u32Old)
2302 , "m" (*pu32)
2303 : "cc");
2304 return (bool)u8Ret;
2305
2306# elif RT_INLINE_ASM_USES_INTRIN
2307 return (*pu32Old = _InterlockedCompareExchange((long RT_FAR *)pu32, u32New, u32Old)) == u32Old;
2308
2309# else
2310 uint32_t u32Ret;
2311 __asm
2312 {
2313# ifdef RT_ARCH_AMD64
2314 mov rdx, [pu32]
2315# else
2316 mov edx, [pu32]
2317# endif
2318 mov eax, [u32Old]
2319 mov ecx, [u32New]
2320# ifdef RT_ARCH_AMD64
2321 lock cmpxchg [rdx], ecx
2322 mov rdx, [pu32Old]
2323 mov [rdx], eax
2324# else
2325 lock cmpxchg [edx], ecx
2326 mov edx, [pu32Old]
2327 mov [edx], eax
2328# endif
2329 setz al
2330 movzx eax, al
2331 mov [u32Ret], eax
2332 }
2333 return !!u32Ret;
2334# endif
2335
2336# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
2337# if RT_INLINE_ASM_USES_INTRIN
2338# if defined(RTASM_ARM64_USE_FEAT_LSE)
2339# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
2340 uint32_t const uOldActual = __casal32(pu32, u32Old, u32New);
2341# else
2342 __dmb(_ARM64_BARRIER_SY);
2343 uint32_t const uOldActual = __cas32(pu32, u32Old, u32New);
2344# endif
2345# else
2346 uint32_t const uOldActual = _InterlockedCompareExchange((char RT_FAR *)pu32, u32New, u32Old);
2347# endif
2348 *pu32Old = uOldActual;
2349 return uOldActual == u32Old; /* Lets hope the compiler is clever enough to replicate our cmp + cset optimization below. */
2350
2351# else
2352
2353 union { uint32_t u; bool f; } fXchg;
2354 /* M1 bench: match: casal= 6590 vs dmb+cas= 1564 vs non-lse=5033 (ps/call)
2355 mismatch: casal=18790 vs dmb+cas=19711 vs non-lse=2503 (ps/call) */
2356# if defined(RTASM_ARM64_USE_FEAT_LSE)
2357 __asm__ __volatile__("Lstart_ASMAtomicCmpXchgExU32_%=:\n\t"
2358# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
2359 "casal %w[uOldActual], %w[uNew], %[pMem]\n\t"
2360# else
2361 RTASM_ARM_DMB_SY
2362 "cas %w[uOldActual], %w[uNew], %[pMem]\n\t"
2363# endif
2364 "cmp %w[uOldActual], %w[uOldOrg]\n\t"
2365 "cset %w[fXchg], eq\n\t"
2366 : [pMem] "+Q" (*pu32)
2367 , [uOldActual] "=&r" (*pu32Old)
2368 , [fXchg] "=&r" (fXchg.u)
2369 : [uNew] "r" (u32New)
2370 , [uOldOrg] "r" (u32Old)
2371 , "[uOldActual]" (u32Old)
2372 : "cc");
2373# else
2374 uint32_t u32ActualOld;
2375 uint32_t rcSpill;
2376 __asm__ __volatile__("Ltry_again_ASMAtomicCmpXchgExU32_%=:\n\t"
2377 RTASM_ARM_DMB_SY
2378# if defined(RT_ARCH_ARM64)
2379 "ldaxr %w[uOld], %[pMem]\n\t"
2380 "cmp %w[uOld], %w[uCmp]\n\t"
2381 "bne 1f\n\t" /* stop here if not equal */
2382 "stlxr %w[rc], %w[uNew], %[pMem]\n\t"
2383 "cbnz %w[rc], Ltry_again_ASMAtomicCmpXchgExU32_%=\n\t"
2384 "mov %w[fXchg], #1\n\t"
2385 "1:\n\t"
2386 "clrex\n\t"
2387# else
2388 "ldrex %[uOld], %[pMem]\n\t"
2389 "teq %[uOld], %[uCmp]\n\t"
2390 "strexeq %[rc], %[uNew], %[pMem]\n\t"
2391 "bne 1f\n\t" /* stop here if not equal */
2392 "cmp %[rc], #0\n\t"
2393 "bne Ltry_again_ASMAtomicCmpXchgExU32_%=\n\t"
2394 "mov %[fXchg], #1\n\t"
2395 "1:\n\t"
2396 /** @todo clrexne on armv7? */
2397# endif
2398 : [pMem] "+Q" (*pu32)
2399 , [uOld] "=&r" (u32ActualOld)
2400 , [rc] "=&r" (rcSpill)
2401 , [fXchg] "=&r" (fXchg.u)
2402 : [uCmp] "r" (u32Old)
2403 , [uNew] "r" (u32New)
2404 , "[fXchg]" (0)
2405 RTASM_ARM_DMB_SY_COMMA_IN_REG
2406 : "cc");
2407 *pu32Old = u32ActualOld;
2408# endif
2409 return fXchg.f;
2410# endif
2411
2412# else
2413# error "Port me"
2414# endif
2415}
2416#endif
2417
2418
2419/**
2420 * Atomically Compare and Exchange a signed 32-bit value, additionally
2421 * passes back old value, ordered.
2422 *
2423 * @returns true if xchg was done.
2424 * @returns false if xchg wasn't done.
2425 *
2426 * @param pi32 Pointer to the value to update.
2427 * @param i32New The new value to assigned to *pi32.
2428 * @param i32Old The old value to *pi32 compare with.
2429 * @param pi32Old Pointer store the old value at.
2430 *
2431 * @remarks x86: Requires a 486 or later.
2432 */
2433DECLINLINE(bool) ASMAtomicCmpXchgExS32(volatile int32_t RT_FAR *pi32, const int32_t i32New, const int32_t i32Old, int32_t RT_FAR *pi32Old) RT_NOTHROW_DEF
2434{
2435 return ASMAtomicCmpXchgExU32((volatile uint32_t RT_FAR *)pi32, (uint32_t)i32New, (uint32_t)i32Old, (uint32_t RT_FAR *)pi32Old);
2436}
2437
2438
2439/**
2440 * Atomically Compare and exchange an unsigned 64-bit value, additionally
2441 * passing back old value, ordered.
2442 *
2443 * @returns true if xchg was done.
2444 * @returns false if xchg wasn't done.
2445 *
2446 * @param pu64 Pointer to the 64-bit variable to update.
2447 * @param u64New The 64-bit value to assign to *pu64.
2448 * @param u64Old The value to compare with.
2449 * @param pu64Old Pointer store the old value at.
2450 *
2451 * @remarks x86: Requires a Pentium or later.
2452 */
2453#if (RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN) \
2454 || RT_INLINE_DONT_MIX_CMPXCHG8B_AND_PIC
2455RT_ASM_DECL_PRAGMA_WATCOM(bool) ASMAtomicCmpXchgExU64(volatile uint64_t RT_FAR *pu64, const uint64_t u64New, const uint64_t u64Old, uint64_t RT_FAR *pu64Old) RT_NOTHROW_PROTO;
2456#else
2457DECLINLINE(bool) ASMAtomicCmpXchgExU64(volatile uint64_t RT_FAR *pu64, const uint64_t u64New, const uint64_t u64Old, uint64_t RT_FAR *pu64Old) RT_NOTHROW_DEF
2458{
2459# if RT_INLINE_ASM_USES_INTRIN
2460 return (*pu64Old =_InterlockedCompareExchange64((__int64 RT_FAR *)pu64, u64New, u64Old)) == u64Old;
2461
2462# elif defined(RT_ARCH_AMD64)
2463# if RT_INLINE_ASM_GNU_STYLE
2464 uint8_t u8Ret;
2465 __asm__ __volatile__("lock; cmpxchgq %3, %0\n\t"
2466 "setz %1\n\t"
2467 : "=m" (*pu64)
2468 , "=qm" (u8Ret)
2469 , "=a" (*pu64Old)
2470 : "r" (u64New)
2471 , "a" (u64Old)
2472 , "m" (*pu64)
2473 : "cc");
2474 return (bool)u8Ret;
2475# else
2476 bool fRet;
2477 __asm
2478 {
2479 mov rdx, [pu32]
2480 mov rax, [u64Old]
2481 mov rcx, [u64New]
2482 lock cmpxchg [rdx], rcx
2483 mov rdx, [pu64Old]
2484 mov [rdx], rax
2485 setz al
2486 mov [fRet], al
2487 }
2488 return fRet;
2489# endif
2490
2491# elif defined(RT_ARCH_X86)
2492# if RT_INLINE_ASM_GNU_STYLE
2493 uint64_t u64Ret;
2494# if defined(PIC) || defined(__PIC__)
2495 /* Note #1: This code uses a memory clobber description, because the clean
2496 solution with an output value for *pu64 makes gcc run out of
2497 registers. This will cause suboptimal code, and anyone with a
2498 better solution is welcome to improve this.
2499
2500 Note #2: We must prevent gcc from encoding the memory access, as it
2501 may go via the GOT if we're working on a global variable (like
2502 in the testcase). Thus we request a register (%3) and
2503 dereference it ourselves. */
2504 __asm__ __volatile__("xchgl %%ebx, %1\n\t"
2505 "lock; cmpxchg8b (%3)\n\t"
2506 "xchgl %%ebx, %1\n\t"
2507 : "=A" (u64Ret)
2508 : "DS" ((uint32_t)u64New)
2509 , "c" ((uint32_t)(u64New >> 32))
2510 , "r" (pu64) /* Do not use "m" here*/
2511 , "0" (u64Old)
2512 : "memory"
2513 , "cc" );
2514# else /* !PIC */
2515 __asm__ __volatile__("lock; cmpxchg8b %4\n\t"
2516 : "=A" (u64Ret)
2517 , "=m" (*pu64)
2518 : "b" ((uint32_t)u64New)
2519 , "c" ((uint32_t)(u64New >> 32))
2520 , "m" (*pu64)
2521 , "0" (u64Old)
2522 : "cc");
2523# endif
2524 *pu64Old = u64Ret;
2525 return u64Ret == u64Old;
2526# else
2527 uint32_t u32Ret;
2528 __asm
2529 {
2530 mov ebx, dword ptr [u64New]
2531 mov ecx, dword ptr [u64New + 4]
2532 mov edi, [pu64]
2533 mov eax, dword ptr [u64Old]
2534 mov edx, dword ptr [u64Old + 4]
2535 lock cmpxchg8b [edi]
2536 mov ebx, [pu64Old]
2537 mov [ebx], eax
2538 setz al
2539 movzx eax, al
2540 add ebx, 4
2541 mov [ebx], edx
2542 mov dword ptr [u32Ret], eax
2543 }
2544 return !!u32Ret;
2545# endif
2546
2547# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
2548 union { uint32_t u; bool f; } fXchg;
2549 /* M1 bench: match: casal= 6606 vs dmb+cas= 1565 vs non-lse=5006 (ps/call)
2550 mismatch: casal=18786 vs dmb+cas=19718 vs non-lse=2503 (ps/call) */
2551# if defined(RTASM_ARM64_USE_FEAT_LSE)
2552 __asm__ __volatile__("Lstart_ASMAtomicCmpXchgExU32_%=:\n\t"
2553# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
2554 "casal %[uOldActual], %[uNew], %[pMem]\n\t"
2555# else
2556 RTASM_ARM_DMB_SY
2557 "cas %[uOldActual], %[uNew], %[pMem]\n\t"
2558# endif
2559 "cmp %[uOldActual], %[uOldOrg]\n\t"
2560 "cset %w[fXchg], eq\n\t"
2561 : [pMem] "+Q" (*pu64)
2562 , [uOldActual] "=&r" (*pu64Old)
2563 , [fXchg] "=&r" (fXchg.u)
2564 : [uNew] "r" (u64New)
2565 , [uOldOrg] "r" (u64Old)
2566 , "[uOldActual]" (u64Old)
2567 : "cc");
2568# else
2569 uint64_t u64ActualOld;
2570 uint32_t rcSpill;
2571 __asm__ __volatile__("Ltry_again_ASMAtomicCmpXchgU64_%=:\n\t"
2572 RTASM_ARM_DMB_SY
2573# if defined(RT_ARCH_ARM64)
2574 "ldaxr %[uOld], %[pMem]\n\t"
2575 "cmp %[uOld], %[uCmp]\n\t"
2576 "bne 1f\n\t" /* stop here if not equal */
2577 "stlxr %w[rc], %[uNew], %[pMem]\n\t"
2578 "cbnz %w[rc], Ltry_again_ASMAtomicCmpXchgU64_%=\n\t"
2579 "mov %w[fXchg], #1\n\t"
2580 "1:\n\t"
2581 "clrex\n\t"
2582# else
2583 "ldrexd %[uOld], %H[uOld], %[pMem]\n\t"
2584 "teq %[uOld], %[uCmp]\n\t"
2585 "teqeq %H[uOld], %H[uCmp]\n\t"
2586 "strexdeq %[rc], %[uNew], %H[uNew], %[pMem]\n\t"
2587 "bne 1f\n\t" /* stop here if not equal */
2588 "cmp %[rc], #0\n\t"
2589 "bne Ltry_again_ASMAtomicCmpXchgU64_%=\n\t"
2590 "mov %[fXchg], #1\n\t"
2591 "1:\n\t"
2592 /** @todo clrexne on armv7? */
2593# endif
2594 : [pMem] "+Q" (*pu64)
2595 , [uOld] "=&r" (u64ActualOld)
2596 , [rc] "=&r" (rcSpill)
2597 , [fXchg] "=&r" (fXchg.u)
2598 : [uCmp] "r" (u64Old)
2599 , [uNew] "r" (u64New)
2600 , "[fXchg]" (0)
2601 RTASM_ARM_DMB_SY_COMMA_IN_REG
2602 : "cc");
2603 *pu64Old = u64ActualOld;
2604# endif
2605 return fXchg.f;
2606
2607# else
2608# error "Port me"
2609# endif
2610}
2611#endif
2612
2613
2614/**
2615 * Atomically Compare and exchange a signed 64-bit value, additionally
2616 * passing back old value, ordered.
2617 *
2618 * @returns true if xchg was done.
2619 * @returns false if xchg wasn't done.
2620 *
2621 * @param pi64 Pointer to the 64-bit variable to update.
2622 * @param i64 The 64-bit value to assign to *pu64.
2623 * @param i64Old The value to compare with.
2624 * @param pi64Old Pointer store the old value at.
2625 *
2626 * @remarks x86: Requires a Pentium or later.
2627 */
2628DECLINLINE(bool) ASMAtomicCmpXchgExS64(volatile int64_t RT_FAR *pi64, const int64_t i64, const int64_t i64Old, int64_t RT_FAR *pi64Old) RT_NOTHROW_DEF
2629{
2630 return ASMAtomicCmpXchgExU64((volatile uint64_t RT_FAR *)pi64, (uint64_t)i64, (uint64_t)i64Old, (uint64_t RT_FAR *)pi64Old);
2631}
2632
2633#if defined(RT_ARCH_AMD64) || defined(RT_ARCH_ARM64) || defined(DOXYGEN_RUNNING)
2634
2635/** @def RTASM_HAVE_CMP_XCHG_U128
2636 * Indicates that we've got ASMAtomicCmpSwapU128(), ASMAtomicCmpSwapU128v2()
2637 * and ASMAtomicCmpSwapExU128() available. */
2638# define RTASM_HAVE_CMP_XCHG_U128 1
2639
2640
2641/**
2642 * Atomically compare and exchange an unsigned 128-bit value, ordered.
2643 *
2644 * @returns true if exchange was done.
2645 * @returns false if exchange wasn't done.
2646 *
2647 * @param pu128 Pointer to the 128-bit variable to update.
2648 * @param u64NewHi The high 64 bits of the value to assign to *pu128.
2649 * @param u64NewLo The low 64 bits of the value to assign to *pu128.
2650 * @param u64OldHi The high 64-bit of the value to compare with.
2651 * @param u64OldLo The low 64-bit of the value to compare with.
2652 * @param pu128Old Where to return the old value.
2653 *
2654 * @remarks AMD64: Not present in the earliest CPUs, so check CPUID.
2655 */
2656# if (RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN)
2657DECLASM(bool) ASMAtomicCmpXchgU128v2(volatile uint128_t *pu128, const uint64_t u64NewHi, const uint64_t u64NewLo,
2658 const uint64_t u64OldHi, const uint64_t u64OldLo, uint128_t *pu128Old) RT_NOTHROW_PROTO;
2659# else
2660DECLINLINE(bool) ASMAtomicCmpXchgU128v2(volatile uint128_t *pu128, const uint64_t u64NewHi, const uint64_t u64NewLo,
2661 const uint64_t u64OldHi, const uint64_t u64OldLo, uint128_t *pu128Old) RT_NOTHROW_DEF
2662{
2663# if RT_INLINE_ASM_USES_INTRIN
2664 pu128Old->Hi = u64OldHi;
2665 pu128Old->Lo = u64OldLo;
2666 AssertCompileMemberOffset(uint128_t, Lo, 0);
2667 return _InterlockedCompareExchange128((__int64 volatile *)pu128, u64NewHi, u64NewLo, (__int64 *)&pu128Old->Lo) != 0;
2668
2669# elif (defined(__clang_major__) || defined(__GNUC__)) && defined(RT_ARCH_ARM64)
2670 uint128_t const uCmp = ((uint128_t)u64OldHi << 64) | u64OldLo;
2671 uint128_t const uOld = __sync_val_compare_and_swap(pu128, uCmp, ((uint128_t)u64NewHi << 64) | u64NewLo);
2672 *pu128Old = uOld;
2673 return uCmp == uOld;
2674
2675# elif defined(RT_ARCH_AMD64)
2676# if RT_INLINE_ASM_GNU_STYLE
2677 uint8_t bRet;
2678 uint64_t u64RetHi, u64RetLo;
2679 __asm__ __volatile__("lock; cmpxchg16b %3\n\t"
2680 "setz %b0\n\t"
2681 : "=r" (bRet)
2682 , "=a" (u64RetLo)
2683 , "=d" (u64RetHi)
2684 , "+m" (*pu128)
2685 : "a" (u64OldLo)
2686 , "d" (u64OldHi)
2687 , "b" (u64NewLo)
2688 , "c" (u64NewHi)
2689 : "cc");
2690 *pu128Old = ((uint128_t)u64RetHi << 64) | u64RetLo;
2691 return (bool)bRet;
2692# else
2693# error "Port me"
2694# endif
2695# else
2696# error "Port me"
2697# endif
2698}
2699# endif
2700
2701
2702/**
2703 * Atomically compare and exchange an unsigned 128-bit value, ordered.
2704 *
2705 * @returns true if exchange was done.
2706 * @returns false if exchange wasn't done.
2707 *
2708 * @param pu128 Pointer to the 128-bit variable to update.
2709 * @param u128New The 128-bit value to assign to *pu128.
2710 * @param u128Old The value to compare with.
2711 * @param pu128Old Where to return the old value.
2712 *
2713 * @remarks AMD64: Not present in the earliest CPUs, so check CPUID.
2714 */
2715DECLINLINE(bool) ASMAtomicCmpXchgU128(volatile uint128_t *pu128, const uint128_t u128New,
2716 const uint128_t u128Old, uint128_t *pu128Old) RT_NOTHROW_DEF
2717{
2718# ifdef RT_COMPILER_WITH_128BIT_INT_TYPES
2719# if (defined(__clang_major__) || defined(__GNUC__)) && defined(RT_ARCH_ARM64)
2720 uint128_t const uSwapped = __sync_val_compare_and_swap(pu128, u128Old, u128New);
2721 *pu128Old = uSwapped;
2722 return uSwapped == u128Old;
2723# else
2724 return ASMAtomicCmpXchgU128v2(pu128, (uint64_t)(u128New >> 64), (uint64_t)u128New,
2725 (uint64_t)(u128Old >> 64), (uint64_t)u128Old, pu128Old);
2726# endif
2727# else
2728 return ASMAtomicCmpXchgU128v2(pu128, u128New.Hi, u128New.Lo, u128Old.Hi, u128Old.Lo, pu128Old);
2729# endif
2730}
2731
2732
2733/**
2734 * RTUINT128U wrapper for ASMAtomicCmpXchgU128.
2735 */
2736DECLINLINE(bool) ASMAtomicCmpXchgU128U(volatile RTUINT128U *pu128, const RTUINT128U u128New,
2737 const RTUINT128U u128Old, PRTUINT128U pu128Old) RT_NOTHROW_DEF
2738{
2739# if (defined(__clang_major__) || defined(__GNUC__)) && defined(RT_ARCH_ARM64)
2740 return ASMAtomicCmpXchgU128(&pu128->u, u128New.u, u128Old.u, &pu128Old->u);
2741# else
2742 return ASMAtomicCmpXchgU128v2(&pu128->u, u128New.s.Hi, u128New.s.Lo, u128Old.s.Hi, u128Old.s.Lo, &pu128Old->u);
2743# endif
2744}
2745
2746#endif /* RT_ARCH_AMD64 || RT_ARCH_ARM64 */
2747
2748
2749
2750/** @def ASMAtomicCmpXchgExHandle
2751 * Atomically Compare and Exchange a typical IPRT handle value, ordered.
2752 *
2753 * @param ph Pointer to the value to update.
2754 * @param hNew The new value to assigned to *pu.
2755 * @param hOld The old value to *pu compare with.
2756 * @param fRc Where to store the result.
2757 * @param phOldVal Pointer to where to store the old value.
2758 *
2759 * @remarks This doesn't currently work for all handles (like RTFILE).
2760 */
2761#if HC_ARCH_BITS == 32 || ARCH_BITS == 16
2762# define ASMAtomicCmpXchgExHandle(ph, hNew, hOld, fRc, phOldVal) \
2763 do { \
2764 AssertCompile(sizeof(*ph) == sizeof(uint32_t)); \
2765 AssertCompile(sizeof(*phOldVal) == sizeof(uint32_t)); \
2766 (fRc) = ASMAtomicCmpXchgExU32((volatile uint32_t RT_FAR *)(ph), (uint32_t)(hNew), (uint32_t)(hOld), (uint32_t RT_FAR *)(phOldVal)); \
2767 } while (0)
2768#elif HC_ARCH_BITS == 64
2769# define ASMAtomicCmpXchgExHandle(ph, hNew, hOld, fRc, phOldVal) \
2770 do { \
2771 AssertCompile(sizeof(*(ph)) == sizeof(uint64_t)); \
2772 AssertCompile(sizeof(*(phOldVal)) == sizeof(uint64_t)); \
2773 (fRc) = ASMAtomicCmpXchgExU64((volatile uint64_t RT_FAR *)(ph), (uint64_t)(hNew), (uint64_t)(hOld), (uint64_t RT_FAR *)(phOldVal)); \
2774 } while (0)
2775#else
2776# error HC_ARCH_BITS
2777#endif
2778
2779
2780/** @def ASMAtomicCmpXchgExSize
2781 * Atomically Compare and Exchange a value which size might differ
2782 * between platforms or compilers. Additionally passes back old value.
2783 *
2784 * @param pu Pointer to the value to update.
2785 * @param uNew The new value to assigned to *pu.
2786 * @param uOld The old value to *pu compare with.
2787 * @param fRc Where to store the result.
2788 * @param puOldVal Pointer to where to store the old value.
2789 *
2790 * @remarks x86: Requires a 486 or later.
2791 */
2792#define ASMAtomicCmpXchgExSize(pu, uNew, uOld, fRc, puOldVal) \
2793 do { \
2794 switch (sizeof(*(pu))) { \
2795 case 4: (fRc) = ASMAtomicCmpXchgExU32((volatile uint32_t RT_FAR *)(void RT_FAR *)(pu), (uint32_t)(uNew), (uint32_t)(uOld), (uint32_t RT_FAR *)(uOldVal)); \
2796 break; \
2797 case 8: (fRc) = ASMAtomicCmpXchgExU64((volatile uint64_t RT_FAR *)(void RT_FAR *)(pu), (uint64_t)(uNew), (uint64_t)(uOld), (uint64_t RT_FAR *)(uOldVal)); \
2798 break; \
2799 default: AssertMsgFailed(("ASMAtomicCmpXchgSize: size %d is not supported\n", sizeof(*(pu)))); \
2800 (fRc) = false; \
2801 (uOldVal) = 0; \
2802 break; \
2803 } \
2804 } while (0)
2805
2806
2807/**
2808 * Atomically Compare and Exchange a pointer value, additionally
2809 * passing back old value, ordered.
2810 *
2811 * @returns true if xchg was done.
2812 * @returns false if xchg wasn't done.
2813 *
2814 * @param ppv Pointer to the value to update.
2815 * @param pvNew The new value to assigned to *ppv.
2816 * @param pvOld The old value to *ppv compare with.
2817 * @param ppvOld Pointer store the old value at.
2818 *
2819 * @remarks x86: Requires a 486 or later.
2820 */
2821DECLINLINE(bool) ASMAtomicCmpXchgExPtrVoid(void RT_FAR * volatile RT_FAR *ppv, const void RT_FAR *pvNew, const void RT_FAR *pvOld,
2822 void RT_FAR * RT_FAR *ppvOld) RT_NOTHROW_DEF
2823{
2824#if ARCH_BITS == 32 || ARCH_BITS == 16
2825 return ASMAtomicCmpXchgExU32((volatile uint32_t RT_FAR *)(void RT_FAR *)ppv, (uint32_t)pvNew, (uint32_t)pvOld, (uint32_t RT_FAR *)ppvOld);
2826#elif ARCH_BITS == 64
2827 return ASMAtomicCmpXchgExU64((volatile uint64_t RT_FAR *)(void RT_FAR *)ppv, (uint64_t)pvNew, (uint64_t)pvOld, (uint64_t RT_FAR *)ppvOld);
2828#else
2829# error "ARCH_BITS is bogus"
2830#endif
2831}
2832
2833
2834/**
2835 * Atomically Compare and Exchange a pointer value, additionally
2836 * passing back old value, ordered.
2837 *
2838 * @returns true if xchg was done.
2839 * @returns false if xchg wasn't done.
2840 *
2841 * @param ppv Pointer to the value to update.
2842 * @param pvNew The new value to assigned to *ppv.
2843 * @param pvOld The old value to *ppv compare with.
2844 * @param ppvOld Pointer store the old value at.
2845 *
2846 * @remarks This is relatively type safe on GCC platforms.
2847 * @remarks x86: Requires a 486 or later.
2848 */
2849#ifdef __GNUC__
2850# define ASMAtomicCmpXchgExPtr(ppv, pvNew, pvOld, ppvOld) \
2851 __extension__ \
2852 ({\
2853 __typeof__(*(ppv)) volatile * const ppvTypeChecked = (ppv); \
2854 __typeof__(*(ppv)) const pvNewTypeChecked = (pvNew); \
2855 __typeof__(*(ppv)) const pvOldTypeChecked = (pvOld); \
2856 __typeof__(*(ppv)) * const ppvOldTypeChecked = (ppvOld); \
2857 bool fMacroRet = ASMAtomicCmpXchgExPtrVoid((void * volatile *)ppvTypeChecked, \
2858 (void *)pvNewTypeChecked, (void *)pvOldTypeChecked, \
2859 (void **)ppvOldTypeChecked); \
2860 fMacroRet; \
2861 })
2862#else
2863# define ASMAtomicCmpXchgExPtr(ppv, pvNew, pvOld, ppvOld) \
2864 ASMAtomicCmpXchgExPtrVoid((void RT_FAR * volatile RT_FAR *)(ppv), (void RT_FAR *)(pvNew), (void RT_FAR *)(pvOld), (void RT_FAR * RT_FAR *)(ppvOld))
2865#endif
2866
2867
2868/**
2869 * Virtualization unfriendly serializing instruction, always exits.
2870 */
2871#if (RT_INLINE_ASM_EXTERNAL && !RT_INLINE_ASM_USES_INTRIN) || (!defined(RT_ARCH_AMD64) && !defined(RT_ARCH_X86))
2872RT_ASM_DECL_PRAGMA_WATCOM(void) ASMSerializeInstructionCpuId(void) RT_NOTHROW_PROTO;
2873#else
2874DECLINLINE(void) ASMSerializeInstructionCpuId(void) RT_NOTHROW_DEF
2875{
2876# if RT_INLINE_ASM_GNU_STYLE
2877 RTCCUINTREG xAX = 0;
2878# ifdef RT_ARCH_AMD64
2879 __asm__ __volatile__ ("cpuid"
2880 : "=a" (xAX)
2881 : "0" (xAX)
2882 : "rbx", "rcx", "rdx", "memory");
2883# elif (defined(PIC) || defined(__PIC__)) && defined(__i386__)
2884 __asm__ __volatile__ ("push %%ebx\n\t"
2885 "cpuid\n\t"
2886 "pop %%ebx\n\t"
2887 : "=a" (xAX)
2888 : "0" (xAX)
2889 : "ecx", "edx", "memory");
2890# else
2891 __asm__ __volatile__ ("cpuid"
2892 : "=a" (xAX)
2893 : "0" (xAX)
2894 : "ebx", "ecx", "edx", "memory");
2895# endif
2896
2897# elif RT_INLINE_ASM_USES_INTRIN
2898 int aInfo[4];
2899 _ReadWriteBarrier();
2900 __cpuid(aInfo, 0);
2901
2902# else
2903 __asm
2904 {
2905 push ebx
2906 xor eax, eax
2907 cpuid
2908 pop ebx
2909 }
2910# endif
2911}
2912#endif
2913
2914/**
2915 * Virtualization friendly serializing instruction, though more expensive.
2916 */
2917#if RT_INLINE_ASM_EXTERNAL || (!defined(RT_ARCH_AMD64) && !defined(RT_ARCH_X86))
2918RT_ASM_DECL_PRAGMA_WATCOM(void) ASMSerializeInstructionIRet(void) RT_NOTHROW_PROTO;
2919#else
2920DECLINLINE(void) ASMSerializeInstructionIRet(void) RT_NOTHROW_DEF
2921{
2922# if RT_INLINE_ASM_GNU_STYLE
2923# ifdef RT_ARCH_AMD64
2924 __asm__ __volatile__ ("movq %%rsp,%%r10\n\t"
2925 "subq $128, %%rsp\n\t" /*redzone*/
2926 "mov %%ss, %%eax\n\t"
2927 "pushq %%rax\n\t"
2928 "pushq %%r10\n\t"
2929 "pushfq\n\t"
2930 "movl %%cs, %%eax\n\t"
2931 "pushq %%rax\n\t"
2932 "leaq 1f(%%rip), %%rax\n\t"
2933 "pushq %%rax\n\t"
2934 "iretq\n\t"
2935 "1:\n\t"
2936 ::: "rax", "r10", "memory", "cc");
2937# else
2938 __asm__ __volatile__ ("pushfl\n\t"
2939 "pushl %%cs\n\t"
2940 "pushl $1f\n\t"
2941 "iretl\n\t"
2942 "1:\n\t"
2943 ::: "memory");
2944# endif
2945
2946# else
2947 __asm
2948 {
2949 pushfd
2950 push cs
2951 push la_ret
2952 iretd
2953 la_ret:
2954 }
2955# endif
2956}
2957#endif
2958
2959/**
2960 * Virtualization friendlier serializing instruction, may still cause exits.
2961 */
2962#if (RT_INLINE_ASM_EXTERNAL && RT_INLINE_ASM_USES_INTRIN < RT_MSC_VER_VS2008) || (!defined(RT_ARCH_AMD64) && !defined(RT_ARCH_X86))
2963RT_ASM_DECL_PRAGMA_WATCOM(void) ASMSerializeInstructionRdTscp(void) RT_NOTHROW_PROTO;
2964#else
2965DECLINLINE(void) ASMSerializeInstructionRdTscp(void) RT_NOTHROW_DEF
2966{
2967# if RT_INLINE_ASM_GNU_STYLE
2968 /* rdtscp is not supported by ancient linux build VM of course :-( */
2969# ifdef RT_ARCH_AMD64
2970 /*__asm__ __volatile__("rdtscp\n\t" ::: "rax", "rdx, "rcx"); */
2971 __asm__ __volatile__(".byte 0x0f,0x01,0xf9\n\t" ::: "rax", "rdx", "rcx", "memory");
2972# else
2973 /*__asm__ __volatile__("rdtscp\n\t" ::: "eax", "edx, "ecx"); */
2974 __asm__ __volatile__(".byte 0x0f,0x01,0xf9\n\t" ::: "eax", "edx", "ecx", "memory");
2975# endif
2976# else
2977# if RT_INLINE_ASM_USES_INTRIN >= RT_MSC_VER_VS2008
2978 uint32_t uIgnore;
2979 _ReadWriteBarrier();
2980 (void)__rdtscp(&uIgnore);
2981 (void)uIgnore;
2982# else
2983 __asm
2984 {
2985 rdtscp
2986 }
2987# endif
2988# endif
2989}
2990#endif
2991
2992
2993/**
2994 * Serialize Instruction (both data store and instruction flush).
2995 */
2996#if (defined(RT_ARCH_X86) && ARCH_BITS == 16) || defined(IN_GUEST)
2997# define ASMSerializeInstruction() ASMSerializeInstructionIRet()
2998#elif defined(RT_ARCH_X86) || defined(RT_ARCH_AMD64)
2999# define ASMSerializeInstruction() ASMSerializeInstructionCpuId()
3000#elif defined(RT_ARCH_SPARC64)
3001RTDECL(void) ASMSerializeInstruction(void) RT_NOTHROW_PROTO;
3002#elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
3003DECLINLINE(void) ASMSerializeInstruction(void) RT_NOTHROW_DEF
3004{
3005# if RT_INLINE_ASM_USES_INTRIN
3006 __dsb(_ARM64_BARRIER_SY);
3007# else
3008 __asm__ __volatile__ (RTASM_ARM_DSB_SY :: RTASM_ARM_DSB_SY_IN_REG :);
3009# endif
3010}
3011#else
3012# error "Port me"
3013#endif
3014
3015
3016/**
3017 * Memory fence, waits for any pending writes and reads to complete.
3018 * @note No implicit compiler barrier (which is probably stupid).
3019 */
3020DECLINLINE(void) ASMMemoryFence(void) RT_NOTHROW_DEF
3021{
3022#if defined(RT_ARCH_AMD64) || (defined(RT_ARCH_X86) && !defined(RT_WITH_OLD_CPU_SUPPORT))
3023# if RT_INLINE_ASM_GNU_STYLE
3024 __asm__ __volatile__ (".byte 0x0f,0xae,0xf0\n\t");
3025# elif RT_INLINE_ASM_USES_INTRIN
3026 _mm_mfence();
3027# else
3028 __asm
3029 {
3030 _emit 0x0f
3031 _emit 0xae
3032 _emit 0xf0
3033 }
3034# endif
3035#elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
3036# if RT_INLINE_ASM_USES_INTRIN
3037 __dmb(_ARM64_BARRIER_SY);
3038# else
3039 __asm__ __volatile__ (RTASM_ARM_DMB_SY :: RTASM_ARM_DMB_SY_IN_REG :);
3040# endif
3041#elif ARCH_BITS == 16
3042 uint16_t volatile u16;
3043 ASMAtomicXchgU16(&u16, 0);
3044#else
3045 uint32_t volatile u32;
3046 ASMAtomicXchgU32(&u32, 0);
3047#endif
3048}
3049
3050
3051/**
3052 * Write fence, waits for any pending writes to complete.
3053 * @note No implicit compiler barrier (which is probably stupid).
3054 */
3055DECLINLINE(void) ASMWriteFence(void) RT_NOTHROW_DEF
3056{
3057#if defined(RT_ARCH_AMD64) || (defined(RT_ARCH_X86) && !defined(RT_WITH_OLD_CPU_SUPPORT))
3058# if RT_INLINE_ASM_GNU_STYLE
3059 __asm__ __volatile__ (".byte 0x0f,0xae,0xf8\n\t");
3060# elif RT_INLINE_ASM_USES_INTRIN
3061 _mm_sfence();
3062# else
3063 __asm
3064 {
3065 _emit 0x0f
3066 _emit 0xae
3067 _emit 0xf8
3068 }
3069# endif
3070#elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
3071# if RT_INLINE_ASM_USES_INTRIN
3072 __dmb(_ARM64_BARRIER_ST);
3073# else
3074 __asm__ __volatile__ (RTASM_ARM_DMB_ST :: RTASM_ARM_DMB_ST_IN_REG :);
3075# endif
3076#else
3077 ASMMemoryFence();
3078#endif
3079}
3080
3081
3082/**
3083 * Read fence, waits for any pending reads to complete.
3084 * @note No implicit compiler barrier (which is probably stupid).
3085 */
3086DECLINLINE(void) ASMReadFence(void) RT_NOTHROW_DEF
3087{
3088#if defined(RT_ARCH_AMD64) || (defined(RT_ARCH_X86) && !defined(RT_WITH_OLD_CPU_SUPPORT))
3089# if RT_INLINE_ASM_GNU_STYLE
3090 __asm__ __volatile__ (".byte 0x0f,0xae,0xe8\n\t");
3091# elif RT_INLINE_ASM_USES_INTRIN
3092 _mm_lfence();
3093# else
3094 __asm
3095 {
3096 _emit 0x0f
3097 _emit 0xae
3098 _emit 0xe8
3099 }
3100# endif
3101#elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
3102# if RT_INLINE_ASM_USES_INTRIN
3103 __dmb(_ARM64_BARRIER_LD);
3104# else
3105 __asm__ __volatile__ (RTASM_ARM_DMB_LD :: RTASM_ARM_DMB_LD_IN_REG :);
3106# endif
3107#else
3108 ASMMemoryFence();
3109#endif
3110}
3111
3112
3113/**
3114 * Atomically reads an unsigned 8-bit value, ordered.
3115 *
3116 * @returns Current *pu8 value
3117 * @param pu8 Pointer to the 8-bit variable to read.
3118 */
3119DECLINLINE(uint8_t) ASMAtomicReadU8(volatile uint8_t RT_FAR *pu8) RT_NOTHROW_DEF
3120{
3121#if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
3122# if RT_INLINE_ASM_USES_INTRIN
3123 return __load_acquire8(pu8);
3124
3125# else
3126 /** @todo check out using ldarb (like __load_acquire8). */
3127 uint32_t u32;
3128# if defined(RTASM_ARM64_USE_FEAT_LSE) && 0 /* very expensive on M1 */
3129 __asm__ __volatile__("Lstart_ASMAtomicReadU8_%=:\n\t"
3130 RTASM_ARM_DMB_SY
3131 "casab %w[uDst], wzr, %[pMem]\n\t"
3132 : [uDst] "=&r" (u32)
3133 : [pMem] "Q" (*pu8),
3134 "0" (0)
3135 RTASM_ARM_DMB_SY_COMMA_IN_REG);
3136# else
3137 __asm__ __volatile__("Lstart_ASMAtomicReadU8_%=:\n\t"
3138 RTASM_ARM_DMB_SY
3139# if defined(RT_ARCH_ARM64)
3140# if 1 /* shouldn't be any need for more than single-copy atomicity when we've got a proper barrier, just like on x86. */
3141 "ldurb %w[uDst], %[pMem]\n\t"
3142# else
3143 "ldxrb %w[uDst], %[pMem]\n\t"
3144 "clrex\n\t"
3145# endif
3146# else
3147 "ldrexb %[uDst], %[pMem]\n\t"
3148 /** @todo clrex */
3149# endif
3150 : [uDst] "=&r" (u32)
3151 : [pMem] "Q" (*pu8)
3152 RTASM_ARM_DMB_SY_COMMA_IN_REG);
3153# endif
3154 return (uint8_t)u32;
3155# endif
3156
3157#else
3158 ASMMemoryFence();
3159 return *pu8; /* byte reads are atomic on x86 */
3160#endif
3161}
3162
3163
3164/**
3165 * Atomically reads an unsigned 8-bit value, unordered.
3166 *
3167 * @returns Current *pu8 value
3168 * @param pu8 Pointer to the 8-bit variable to read.
3169 */
3170DECLINLINE(uint8_t) ASMAtomicUoReadU8(volatile uint8_t RT_FAR *pu8) RT_NOTHROW_DEF
3171{
3172#if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
3173# if RT_INLINE_ASM_USES_INTRIN
3174 return (uint8_t)__iso_volatile_load8((volatile char *)pu8); /* (emits ldrsb, sign-extending it to 32-bit) */
3175
3176# else
3177 uint32_t u32;
3178 __asm__ __volatile__("Lstart_ASMAtomicUoReadU8_%=:\n\t"
3179# if defined(RT_ARCH_ARM64)
3180 "ldurb %w[uDst], %[pMem]\n\t"
3181# else
3182 "ldrexb %[uDst], %[pMem]\n\t" /** @todo fix this */
3183# endif
3184 : [uDst] "=&r" (u32)
3185 : [pMem] "Q" (*pu8));
3186 return (uint8_t)u32;
3187# endif
3188
3189#else
3190 return *pu8; /* byte reads are atomic on x86 */
3191#endif
3192}
3193
3194
3195/**
3196 * Atomically reads a signed 8-bit value, ordered.
3197 *
3198 * @returns Current *pi8 value
3199 * @param pi8 Pointer to the 8-bit variable to read.
3200 */
3201DECLINLINE(int8_t) ASMAtomicReadS8(volatile int8_t RT_FAR *pi8) RT_NOTHROW_DEF
3202{
3203#if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
3204 return (int8_t)ASMAtomicReadU8((volatile uint8_t RT_FAR *)pi8);
3205#else
3206 ASMMemoryFence();
3207 return *pi8; /* byte reads are atomic on x86 */
3208#endif
3209}
3210
3211
3212/**
3213 * Atomically reads a signed 8-bit value, unordered.
3214 *
3215 * @returns Current *pi8 value
3216 * @param pi8 Pointer to the 8-bit variable to read.
3217 */
3218DECLINLINE(int8_t) ASMAtomicUoReadS8(volatile int8_t RT_FAR *pi8) RT_NOTHROW_DEF
3219{
3220#if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
3221# if RT_INLINE_ASM_USES_INTRIN
3222 return __iso_volatile_load8((volatile const char *)pi8);
3223
3224# else
3225 int32_t i32;
3226 __asm__ __volatile__("Lstart_ASMAtomicUoReadS8_%=:\n\t"
3227# if defined(RT_ARCH_ARM64)
3228 "ldurb %w[iDst], %[pMem]\n\t"
3229# else
3230 "ldrexb %[iDst], %[pMem]\n\t" /** @todo fix this */
3231# endif
3232 : [iDst] "=&r" (i32)
3233 : [pMem] "Q" (*pi8));
3234 return (int8_t)i32;
3235# endif
3236
3237#else
3238 return *pi8; /* byte reads are atomic on x86 */
3239#endif
3240}
3241
3242
3243/**
3244 * Atomically reads an unsigned 16-bit value, ordered.
3245 *
3246 * @returns Current *pu16 value
3247 * @param pu16 Pointer to the 16-bit variable to read.
3248 */
3249DECLINLINE(uint16_t) ASMAtomicReadU16(volatile uint16_t RT_FAR *pu16) RT_NOTHROW_DEF
3250{
3251 Assert(!((uintptr_t)pu16 & 1));
3252#if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
3253# if RT_INLINE_ASM_USES_INTRIN
3254 return __load_acquire16(pu16);
3255
3256# else
3257 /** @todo check out using ldarh (like __load_acquire16). */
3258 uint32_t u32;
3259# if defined(RTASM_ARM64_USE_FEAT_LSE) && 0 /* very expensive on M1, but alignment advantages with LEA2 (M2?). */
3260 __asm__ __volatile__("Lstart_ASMAtomicReadU16_%=:\n\t"
3261 RTASM_ARM_DMB_SY
3262 "casah %w[uDst], wzr, %[pMem]\n\t"
3263 : [uDst] "=&r" (u32)
3264 : [pMem] "Q" (*pu16),
3265 "0" (0)
3266 RTASM_ARM_DMB_SY_COMMA_IN_REG);
3267# else
3268 __asm__ __volatile__("Lstart_ASMAtomicReadU16_%=:\n\t"
3269 RTASM_ARM_DMB_SY
3270# if defined(RT_ARCH_ARM64)
3271# if 1 /* ASSUMING proper barrier and aligned access, we should be fine with single-copy atomicity, just like on x86. */
3272 "ldurh %w[uDst], %[pMem]\n\t"
3273# else
3274 "ldxrh %w[uDst], %[pMem]\n\t"
3275 "clrex\n\t"
3276# endif
3277# else
3278 "ldrexh %[uDst], %[pMem]\n\t"
3279 /** @todo clrex */
3280# endif
3281 : [uDst] "=&r" (u32)
3282 : [pMem] "Q" (*pu16)
3283 RTASM_ARM_DMB_SY_COMMA_IN_REG);
3284# endif
3285 return (uint16_t)u32;
3286# endif
3287
3288#else
3289 ASMMemoryFence();
3290 return *pu16;
3291#endif
3292}
3293
3294
3295/**
3296 * Atomically reads an unsigned 16-bit value, unordered.
3297 *
3298 * @returns Current *pu16 value
3299 * @param pu16 Pointer to the 16-bit variable to read.
3300 */
3301DECLINLINE(uint16_t) ASMAtomicUoReadU16(volatile uint16_t RT_FAR *pu16) RT_NOTHROW_DEF
3302{
3303 Assert(!((uintptr_t)pu16 & 1));
3304#if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
3305# if RT_INLINE_ASM_USES_INTRIN
3306 return (uint16_t)__iso_volatile_load16((volatile int16_t *)pu16); /* (emits ldrsh, sign-extending it to 32-bit) */
3307
3308# else
3309 uint32_t u32;
3310 __asm__ __volatile__("Lstart_ASMAtomicUoReadU16_%=:\n\t"
3311# if defined(RT_ARCH_ARM64)
3312 "ldurh %w[uDst], %[pMem]\n\t"
3313# else
3314 "ldrexh %[uDst], %[pMem]\n\t" /** @todo fix this */
3315# endif
3316 : [uDst] "=&r" (u32)
3317 : [pMem] "Q" (*pu16));
3318 return (uint16_t)u32;
3319# endif
3320
3321#else
3322 return *pu16;
3323#endif
3324}
3325
3326
3327/**
3328 * Atomically reads a signed 16-bit value, ordered.
3329 *
3330 * @returns Current *pi16 value
3331 * @param pi16 Pointer to the 16-bit variable to read.
3332 */
3333DECLINLINE(int16_t) ASMAtomicReadS16(volatile int16_t RT_FAR *pi16) RT_NOTHROW_DEF
3334{
3335 Assert(!((uintptr_t)pi16 & 1));
3336#if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
3337 return (int16_t)ASMAtomicReadU16((volatile uint16_t RT_FAR *)pi16);
3338#else
3339 ASMMemoryFence();
3340 return *pi16;
3341#endif
3342}
3343
3344
3345/**
3346 * Atomically reads a signed 16-bit value, unordered.
3347 *
3348 * @returns Current *pi16 value
3349 * @param pi16 Pointer to the 16-bit variable to read.
3350 */
3351DECLINLINE(int16_t) ASMAtomicUoReadS16(volatile int16_t RT_FAR *pi16) RT_NOTHROW_DEF
3352{
3353 Assert(!((uintptr_t)pi16 & 1));
3354#if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
3355# if RT_INLINE_ASM_USES_INTRIN
3356 return __iso_volatile_load16(pi16);
3357
3358# else
3359 int32_t i32;
3360 __asm__ __volatile__("Lstart_ASMAtomicUoReadS16_%=:\n\t"
3361# if defined(RT_ARCH_ARM64)
3362 "ldurh %w[iDst], %[pMem]\n\t"
3363# else
3364 "ldrexh %[iDst], %[pMem]\n\t" /** @todo fix this */
3365# endif
3366 : [iDst] "=&r" (i32)
3367 : [pMem] "Q" (*pi16));
3368 return (int16_t)i32;
3369# endif
3370
3371#else
3372 return *pi16;
3373#endif
3374}
3375
3376
3377/**
3378 * Atomically reads an unsigned 32-bit value, ordered.
3379 *
3380 * @returns Current *pu32 value
3381 * @param pu32 Pointer to the 32-bit variable to read.
3382 */
3383DECLINLINE(uint32_t) ASMAtomicReadU32(volatile uint32_t RT_FAR *pu32) RT_NOTHROW_DEF
3384{
3385 Assert(!((uintptr_t)pu32 & 3));
3386#if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
3387# if RT_INLINE_ASM_USES_INTRIN
3388 return (uint32_t)__load_acquire32(pu32);
3389
3390# else
3391 /** @todo check out using ldar (like __load_acquire32). */
3392 uint32_t u32;
3393# if defined(RTASM_ARM64_USE_FEAT_LSE) && 0 /* very expensive on M1, but alignment advantages with LEA2 (M2?). */
3394 __asm__ __volatile__("Lstart_ASMAtomicReadU32_%=:\n\t"
3395 RTASM_ARM_DMB_SY
3396 "casa %w[uDst], wzr, %[pMem]\n\t"
3397 : [uDst] "=&r" (u32)
3398 : [pMem] "Q" (*pu32),
3399 "0" (0)
3400 RTASM_ARM_DMB_SY_COMMA_IN_REG);
3401# else
3402 __asm__ __volatile__("Lstart_ASMAtomicReadU32_%=:\n\t"
3403 RTASM_ARM_DMB_SY
3404# if defined(RT_ARCH_ARM64)
3405# if 1 /* ASSUMING proper barrier and aligned access, we should be fine with single-copy atomicity, just like on x86. */
3406 "ldur %w[uDst], %[pMem]\n\t"
3407# else
3408 "ldxr %w[uDst], %[pMem]\n\t"
3409 "clrex\n\t"
3410# endif
3411# else
3412 "ldrex %[uDst], %[pMem]\n\t"
3413 /** @todo clrex */
3414# endif
3415 : [uDst] "=&r" (u32)
3416 : [pMem] "Q" (*pu32)
3417 RTASM_ARM_DMB_SY_COMMA_IN_REG);
3418# endif
3419 return u32;
3420# endif
3421
3422#else
3423 ASMMemoryFence();
3424# if ARCH_BITS == 16
3425 AssertFailed(); /** @todo 16-bit */
3426# endif
3427 return *pu32;
3428#endif
3429}
3430
3431
3432/**
3433 * Atomically reads an unsigned 32-bit value, unordered.
3434 *
3435 * @returns Current *pu32 value
3436 * @param pu32 Pointer to the 32-bit variable to read.
3437 */
3438DECLINLINE(uint32_t) ASMAtomicUoReadU32(volatile uint32_t RT_FAR *pu32) RT_NOTHROW_DEF
3439{
3440 Assert(!((uintptr_t)pu32 & 3));
3441#if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
3442# if RT_INLINE_ASM_USES_INTRIN
3443 return (uint32_t)__iso_volatile_load32((volatile int32_t *)pu32);
3444
3445# else
3446 uint32_t u32;
3447 __asm__ __volatile__("Lstart_ASMAtomicUoReadU32_%=:\n\t"
3448# if defined(RT_ARCH_ARM64)
3449 "ldur %w[uDst], %[pMem]\n\t"
3450# else
3451 "ldrex %[uDst], %[pMem]\n\t" /** @todo fix this */
3452# endif
3453 : [uDst] "=&r" (u32)
3454 : [pMem] "Q" (*pu32));
3455 return u32;
3456# endif
3457
3458#else
3459# if ARCH_BITS == 16
3460 AssertFailed(); /** @todo 16-bit */
3461# endif
3462 return *pu32;
3463#endif
3464}
3465
3466
3467/**
3468 * Atomically reads a signed 32-bit value, ordered.
3469 *
3470 * @returns Current *pi32 value
3471 * @param pi32 Pointer to the 32-bit variable to read.
3472 */
3473DECLINLINE(int32_t) ASMAtomicReadS32(volatile int32_t RT_FAR *pi32) RT_NOTHROW_DEF
3474{
3475 Assert(!((uintptr_t)pi32 & 3));
3476#if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
3477 return (int32_t)ASMAtomicReadU32((volatile uint32_t RT_FAR *)pi32);
3478#else
3479 ASMMemoryFence();
3480# if ARCH_BITS == 16
3481 AssertFailed(); /** @todo 16-bit */
3482# endif
3483 return *pi32;
3484#endif
3485}
3486
3487
3488/**
3489 * Atomically reads a signed 32-bit value, unordered.
3490 *
3491 * @returns Current *pi32 value
3492 * @param pi32 Pointer to the 32-bit variable to read.
3493 */
3494DECLINLINE(int32_t) ASMAtomicUoReadS32(volatile int32_t RT_FAR *pi32) RT_NOTHROW_DEF
3495{
3496 Assert(!((uintptr_t)pi32 & 3));
3497#if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
3498# if RT_INLINE_ASM_USES_INTRIN
3499 return __iso_volatile_load32(pi32);
3500
3501# else
3502 int32_t i32;
3503 __asm__ __volatile__("Lstart_ASMAtomicUoReadS32_%=:\n\t"
3504# if defined(RT_ARCH_ARM64)
3505 "ldur %w[iDst], %[pMem]\n\t"
3506# else
3507 "ldrex %[iDst], %[pMem]\n\t" /** @todo thix this */
3508# endif
3509 : [iDst] "=&r" (i32)
3510 : [pMem] "Q" (*pi32));
3511 return i32;
3512# endif
3513
3514#else
3515# if ARCH_BITS == 16
3516 AssertFailed(); /** @todo 16-bit */
3517# endif
3518 return *pi32;
3519#endif
3520}
3521
3522
3523/**
3524 * Atomically reads an unsigned 64-bit value, ordered.
3525 *
3526 * @returns Current *pu64 value
3527 * @param pu64 Pointer to the 64-bit variable to read.
3528 * The memory pointed to must be writable.
3529 *
3530 * @remarks This may fault if the memory is read-only!
3531 * @remarks x86: Requires a Pentium or later.
3532 */
3533#if (RT_INLINE_ASM_EXTERNAL_TMP_ARM && !defined(RT_ARCH_AMD64)) \
3534 || RT_INLINE_DONT_MIX_CMPXCHG8B_AND_PIC
3535RT_ASM_DECL_PRAGMA_WATCOM(uint64_t) ASMAtomicReadU64(volatile uint64_t RT_FAR *pu64) RT_NOTHROW_PROTO;
3536#else
3537DECLINLINE(uint64_t) ASMAtomicReadU64(volatile uint64_t RT_FAR *pu64) RT_NOTHROW_DEF
3538{
3539 uint64_t u64;
3540# ifdef RT_ARCH_AMD64
3541 Assert(!((uintptr_t)pu64 & 7));
3542/*# if RT_INLINE_ASM_GNU_STYLE
3543 __asm__ __volatile__( "mfence\n\t"
3544 "movq %1, %0\n\t"
3545 : "=r" (u64)
3546 : "m" (*pu64));
3547# else
3548 __asm
3549 {
3550 mfence
3551 mov rdx, [pu64]
3552 mov rax, [rdx]
3553 mov [u64], rax
3554 }
3555# endif*/
3556 ASMMemoryFence();
3557 u64 = *pu64;
3558
3559# elif defined(RT_ARCH_X86)
3560# if RT_INLINE_ASM_GNU_STYLE
3561# if defined(PIC) || defined(__PIC__)
3562 uint32_t u32EBX = 0;
3563 Assert(!((uintptr_t)pu64 & 7));
3564 __asm__ __volatile__("xchgl %%ebx, %3\n\t"
3565 "lock; cmpxchg8b (%5)\n\t"
3566 "movl %3, %%ebx\n\t"
3567 : "=A" (u64)
3568# if RT_GNUC_PREREQ(4, 3)
3569 , "+m" (*pu64)
3570# else
3571 , "=m" (*pu64)
3572# endif
3573 : "0" (0ULL)
3574 , "m" (u32EBX)
3575 , "c" (0)
3576 , "S" (pu64)
3577 : "cc");
3578# else /* !PIC */
3579 __asm__ __volatile__("lock; cmpxchg8b %1\n\t"
3580 : "=A" (u64)
3581 , "+m" (*pu64)
3582 : "0" (0ULL)
3583 , "b" (0)
3584 , "c" (0)
3585 : "cc");
3586# endif
3587# else
3588 Assert(!((uintptr_t)pu64 & 7));
3589 __asm
3590 {
3591 xor eax, eax
3592 xor edx, edx
3593 mov edi, pu64
3594 xor ecx, ecx
3595 xor ebx, ebx
3596 lock cmpxchg8b [edi]
3597 mov dword ptr [u64], eax
3598 mov dword ptr [u64 + 4], edx
3599 }
3600# endif
3601
3602# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
3603 Assert(!((uintptr_t)pu64 & 7));
3604
3605# if RT_INLINE_ASM_USES_INTRIN
3606 u64 = (uint64_t)__load_acquire64(pu64);
3607
3608# else
3609 /** @todo check out ldar (like __load_acquire64) */
3610# if defined(RTASM_ARM64_USE_FEAT_LSE) && 0 /* very expensive on M1, but alignment advantages with LEA2 (M2?). */
3611 __asm__ __volatile__("Lstart_ASMAtomicReadU64_%=:\n\t"
3612 RTASM_ARM_DMB_SY
3613 "casa %[uDst], xzr, %[pMem]\n\t"
3614 : [uDst] "=&r" (u64)
3615 : [pMem] "Q" (*pu64),
3616 "0" (0)
3617 RTASM_ARM_DMB_SY_COMMA_IN_REG);
3618# else
3619 __asm__ __volatile__("Lstart_ASMAtomicReadU64_%=:\n\t"
3620 RTASM_ARM_DMB_SY
3621# if defined(RT_ARCH_ARM64)
3622# if 1 /* ASSUMING proper barrier and aligned access, we should be fine with single-copy atomicity, just like on x86. */
3623 "ldur %[uDst], %[pMem]\n\t"
3624# else
3625 "ldxr %[uDst], %[pMem]\n\t"
3626 "clrex\n\t"
3627# endif
3628# else
3629 "ldrexd %[uDst], %H[uDst], %[pMem]\n\t"
3630 /** @todo clrex */
3631# endif
3632 : [uDst] "=&r" (u64)
3633 : [pMem] "Q" (*pu64)
3634 RTASM_ARM_DMB_SY_COMMA_IN_REG);
3635# endif
3636# endif
3637
3638# else
3639# error "Port me"
3640# endif
3641 return u64;
3642}
3643#endif
3644
3645
3646/**
3647 * Atomically reads an unsigned 64-bit value, unordered.
3648 *
3649 * @returns Current *pu64 value
3650 * @param pu64 Pointer to the 64-bit variable to read.
3651 * The memory pointed to must be writable.
3652 *
3653 * @remarks This may fault if the memory is read-only!
3654 * @remarks x86: Requires a Pentium or later.
3655 */
3656#if !defined(RT_ARCH_AMD64) \
3657 && ( (RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN) \
3658 || RT_INLINE_DONT_MIX_CMPXCHG8B_AND_PIC)
3659RT_ASM_DECL_PRAGMA_WATCOM(uint64_t) ASMAtomicUoReadU64(volatile uint64_t RT_FAR *pu64) RT_NOTHROW_PROTO;
3660#else
3661DECLINLINE(uint64_t) ASMAtomicUoReadU64(volatile uint64_t RT_FAR *pu64) RT_NOTHROW_DEF
3662{
3663 uint64_t u64;
3664# ifdef RT_ARCH_AMD64
3665 Assert(!((uintptr_t)pu64 & 7));
3666/*# if RT_INLINE_ASM_GNU_STYLE
3667 Assert(!((uintptr_t)pu64 & 7));
3668 __asm__ __volatile__("movq %1, %0\n\t"
3669 : "=r" (u64)
3670 : "m" (*pu64));
3671# else
3672 __asm
3673 {
3674 mov rdx, [pu64]
3675 mov rax, [rdx]
3676 mov [u64], rax
3677 }
3678# endif */
3679 u64 = *pu64;
3680
3681# elif defined(RT_ARCH_X86)
3682# if RT_INLINE_ASM_GNU_STYLE
3683# if defined(PIC) || defined(__PIC__)
3684 uint32_t u32EBX = 0;
3685 uint32_t u32Spill;
3686 Assert(!((uintptr_t)pu64 & 7));
3687 __asm__ __volatile__("xor %%eax,%%eax\n\t"
3688 "xor %%ecx,%%ecx\n\t"
3689 "xor %%edx,%%edx\n\t"
3690 "xchgl %%ebx, %3\n\t"
3691 "lock; cmpxchg8b (%4)\n\t"
3692 "movl %3, %%ebx\n\t"
3693 : "=A" (u64)
3694# if RT_GNUC_PREREQ(4, 3)
3695 , "+m" (*pu64)
3696# else
3697 , "=m" (*pu64)
3698# endif
3699 , "=c" (u32Spill)
3700 : "m" (u32EBX)
3701 , "S" (pu64)
3702 : "cc");
3703# else /* !PIC */
3704 __asm__ __volatile__("lock; cmpxchg8b %1\n\t"
3705 : "=A" (u64)
3706 , "+m" (*pu64)
3707 : "0" (0ULL)
3708 , "b" (0)
3709 , "c" (0)
3710 : "cc");
3711# endif
3712# else
3713 Assert(!((uintptr_t)pu64 & 7));
3714 __asm
3715 {
3716 xor eax, eax
3717 xor edx, edx
3718 mov edi, pu64
3719 xor ecx, ecx
3720 xor ebx, ebx
3721 lock cmpxchg8b [edi]
3722 mov dword ptr [u64], eax
3723 mov dword ptr [u64 + 4], edx
3724 }
3725# endif
3726
3727# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
3728 Assert(!((uintptr_t)pu64 & 7));
3729# if RT_INLINE_ASM_USES_INTRIN
3730 u64 = (uint64_t)__iso_volatile_load64((volatile int64_t *)pu64);
3731
3732# else
3733 __asm__ __volatile__("Lstart_ASMAtomicUoReadU64_%=:\n\t"
3734# if defined(RT_ARCH_ARM64)
3735 "ldur %[uDst], %[pMem]\n\t"
3736# else
3737 "ldrexd %[uDst], %H[uDst], %[pMem]\n\t" /* this is required for atomic access since it's a pair */
3738 /** @todo clrex? */
3739# endif
3740 : [uDst] "=&r" (u64)
3741 : [pMem] "Q" (*pu64));
3742# endif
3743
3744# else
3745# error "Port me"
3746# endif
3747 return u64;
3748}
3749#endif
3750
3751
3752/**
3753 * Atomically reads a signed 64-bit value, ordered.
3754 *
3755 * @returns Current *pi64 value
3756 * @param pi64 Pointer to the 64-bit variable to read.
3757 * The memory pointed to must be writable.
3758 *
3759 * @remarks This may fault if the memory is read-only!
3760 * @remarks x86: Requires a Pentium or later.
3761 */
3762DECLINLINE(int64_t) ASMAtomicReadS64(volatile int64_t RT_FAR *pi64) RT_NOTHROW_DEF
3763{
3764 return (int64_t)ASMAtomicReadU64((volatile uint64_t RT_FAR *)pi64);
3765}
3766
3767
3768/**
3769 * Atomically reads a signed 64-bit value, unordered.
3770 *
3771 * @returns Current *pi64 value
3772 * @param pi64 Pointer to the 64-bit variable to read.
3773 * The memory pointed to must be writable.
3774 *
3775 * @remarks This will fault if the memory is read-only!
3776 * @remarks x86: Requires a Pentium or later.
3777 */
3778DECLINLINE(int64_t) ASMAtomicUoReadS64(volatile int64_t RT_FAR *pi64) RT_NOTHROW_DEF
3779{
3780 return (int64_t)ASMAtomicUoReadU64((volatile uint64_t RT_FAR *)pi64);
3781}
3782
3783
3784/** @def RTASM_HAVE_READ_U128
3785 * Defined in the target architecture supports atomic reading of 128-bit
3786 * integers.
3787 *
3788 * The define value is zero if both ordered and unordered reads are implemented
3789 * using ASMAtomicCmpXchgU128v2(). It is 1 if unordered reads are done natively
3790 * w/o cmpxchg and 3 if both variants are done natively w/o cmpxchg.
3791 *
3792 * @note AMD64: Caller must check for cmpxchg16b support before use and make
3793 * sure variables are writable (won't be changed).
3794 * @sa RTASM_HAVE_CMP_XCHG_U128, RTASM_HAVE_WRITE_U128
3795 */
3796#if defined(RT_ARCH_ARM64) || defined(DOXYGEN_RUNNING)
3797# define RTASM_HAVE_READ_U128 3
3798#elif defined(RTASM_HAVE_CMP_XCHG_U128)
3799# define RTASM_HAVE_READ_U128 0
3800#endif
3801
3802#ifdef RTASM_HAVE_READ_U128
3803
3804/**
3805 * Atomically reads an unsigned 128-bit value, ordered.
3806 *
3807 * @returns Current *pu128 value
3808 * @param pu128 Pointer to the 128-bit variable to read.
3809 * The memory pointed to must be writable.
3810 *
3811 * @remarks AMD64: Requires the memory to be both readable and writable.
3812 * @remarks AMD64: Requires support for cmpxchg16b.
3813 */
3814DECLINLINE(uint128_t) ASMAtomicReadU128(volatile uint128_t RT_FAR *pu128) RT_NOTHROW_DEF
3815{
3816 RTUINT128U u128Ret;
3817 Assert(!((uintptr_t)pu128 & 15));
3818# if defined(__GNUC__) && defined(RT_ARCH_ARM64)
3819 __asm__ __volatile__("Lstart_ASMAtomicReadU128_%=:\n\t"
3820 RTASM_ARM_DMB_SY
3821 "ldp %[uRetLo], %[uRetHi], %[pMem]\n\t"
3822 RTASM_ARM_DMB_SY
3823 : [uRetHi] "=r" (u128Ret.s.Hi)
3824 , [uRetLo] "=r" (u128Ret.s.Lo)
3825 : [pMem] "Q" (*pu128)
3826 : );
3827 return u128Ret.u;
3828# else
3829 ASMAtomicCmpXchgU128v2(pu128, 0, 0, 0, 0, &u128Ret.u);
3830 return u128Ret.u;
3831# endif
3832}
3833
3834/**
3835 * Atomically reads an unsigned 128-bit value, ordered.
3836 *
3837 * @returns Current *pu128 value
3838 * @param pu128 Pointer to the 128-bit variable to read.
3839 * The memory pointed to must be writable.
3840 *
3841 * @remarks AMD64: Requires the memory to be both readable and writable.
3842 * @remarks AMD64: Requires support for cmpxchg16b.
3843 */
3844DECLINLINE(RTUINT128U) ASMAtomicReadU128U(volatile RTUINT128U RT_FAR *pu128) RT_NOTHROW_DEF
3845{
3846 RTUINT128U u128Ret;
3847 Assert(!((uintptr_t)pu128 & 15));
3848# if defined(__GNUC__) && defined(RT_ARCH_ARM64)
3849 __asm__ __volatile__("Lstart_ASMAtomicReadU128U_%=:\n\t"
3850 RTASM_ARM_DMB_SY
3851 "ldp %[uRetLo], %[uRetHi], %[pMem]\n\t"
3852 RTASM_ARM_DMB_SY
3853 : [uRetHi] "=r" (u128Ret.s.Hi)
3854 , [uRetLo] "=r" (u128Ret.s.Lo)
3855 : [pMem] "Q" (*pu128)
3856 : );
3857 return u128Ret;
3858# else
3859 ASMAtomicCmpXchgU128v2(&pu128->u, 0, 0, 0, 0, &u128Ret.u);
3860 return u128Ret;
3861# endif
3862}
3863
3864
3865/**
3866 * Atomically reads an unsigned 128-bit value, unordered.
3867 *
3868 * @returns Current *pu128 value
3869 * @param pu128 Pointer to the 128-bit variable to read.
3870 * The memory pointed to must be writable.
3871 *
3872 * @remarks AMD64: Requires the memory to be both readable and writable.
3873 * @remarks AMD64: Requires support for cmpxchg16b.
3874 * @remarks AMD64: Is ordered.
3875 */
3876DECLINLINE(uint128_t) ASMAtomicUoReadU128(volatile uint128_t RT_FAR *pu128) RT_NOTHROW_DEF
3877{
3878 Assert(!((uintptr_t)pu128 & 15));
3879# if defined(__GNUC__) && defined(RT_ARCH_ARM64)
3880 RTUINT128U u128Ret;
3881 __asm__ __volatile__("Lstart_ASMAtomicUoReadU128_%=:\n\t"
3882 "ldp %[uRetLo], %[uRetHi], %[pMem]\n\t"
3883 : [uRetHi] "=r" (u128Ret.s.Hi)
3884 , [uRetLo] "=r" (u128Ret.s.Lo)
3885 : [pMem] "Q" (*pu128)
3886 : );
3887 return u128Ret.u;
3888
3889# elif defined(RT_ARCH_AMD64) && 0
3890 /* This doesn't work because __m128i can't be made volatile and we're not
3891 able to force MSC (2019) to emit _mm_load_si128 (besides it emits movdqu
3892 instead of movdqa). */
3893 __m128i uTmpSse = _mm_load_si128((__m128i volatile *)pu128);
3894 __m128i uTmpSseHi = _mm_srli_si128(uTmpSse, 64 / 8);
3895 RTUINT128U u128Ret;
3896 u128Ret.s.Lo = (uint64_t)_mm_cvtsi128_si64(uTmpSse);
3897 u128Ret.s.Hi = (uint64_t)_mm_cvtsi128_si64(uTmpSseHi);
3898 return u128Ret.u;
3899
3900# else
3901 return ASMAtomicReadU128(pu128);
3902# endif
3903}
3904
3905/**
3906 * Atomically reads an unsigned 128-bit value, unordered.
3907 *
3908 * @returns Current *pu128 value
3909 * @param pu128 Pointer to the 128-bit variable to read.
3910 * The memory pointed to must be writable.
3911 *
3912 * @remarks AMD64: Requires the memory to be both readable and writable.
3913 * @remarks AMD64: Requires support for cmpxchg16b.
3914 * @remarks AMD64: Is ordered.
3915 */
3916DECLINLINE(RTUINT128U) ASMAtomicUoReadU128U(volatile RTUINT128U RT_FAR *pu128) RT_NOTHROW_DEF
3917{
3918 Assert(!((uintptr_t)pu128 & 15));
3919# if defined(__GNUC__) && defined(RT_ARCH_ARM64)
3920 RTUINT128U u128Ret;
3921 __asm__ __volatile__("Lstart_ASMAtomicUoReadU128U_%=:\n\t"
3922 "ldp %[uRetLo], %[uRetHi], %[pMem]\n\t"
3923 : [uRetHi] "=r" (u128Ret.s.Hi)
3924 , [uRetLo] "=r" (u128Ret.s.Lo)
3925 : [pMem] "Q" (*pu128)
3926 : );
3927 return u128Ret;
3928# else
3929 return ASMAtomicReadU128U(pu128);
3930# endif
3931}
3932
3933#endif /* RTASM_HAVE_READ_U128 */
3934
3935/**
3936 * Atomically reads a size_t value, ordered.
3937 *
3938 * @returns Current *pcb value
3939 * @param pcb Pointer to the size_t variable to read.
3940 */
3941DECLINLINE(size_t) ASMAtomicReadZ(size_t volatile RT_FAR *pcb) RT_NOTHROW_DEF
3942{
3943#if ARCH_BITS == 64
3944 return ASMAtomicReadU64((uint64_t volatile RT_FAR *)pcb);
3945#elif ARCH_BITS == 32
3946 return ASMAtomicReadU32((uint32_t volatile RT_FAR *)pcb);
3947#elif ARCH_BITS == 16
3948 AssertCompileSize(size_t, 2);
3949 return ASMAtomicReadU16((uint16_t volatile RT_FAR *)pcb);
3950#else
3951# error "Unsupported ARCH_BITS value"
3952#endif
3953}
3954
3955
3956/**
3957 * Atomically reads a size_t value, unordered.
3958 *
3959 * @returns Current *pcb value
3960 * @param pcb Pointer to the size_t variable to read.
3961 */
3962DECLINLINE(size_t) ASMAtomicUoReadZ(size_t volatile RT_FAR *pcb) RT_NOTHROW_DEF
3963{
3964#if ARCH_BITS == 64 || ARCH_BITS == 16
3965 return ASMAtomicUoReadU64((uint64_t volatile RT_FAR *)pcb);
3966#elif ARCH_BITS == 32
3967 return ASMAtomicUoReadU32((uint32_t volatile RT_FAR *)pcb);
3968#elif ARCH_BITS == 16
3969 AssertCompileSize(size_t, 2);
3970 return ASMAtomicUoReadU16((uint16_t volatile RT_FAR *)pcb);
3971#else
3972# error "Unsupported ARCH_BITS value"
3973#endif
3974}
3975
3976
3977/**
3978 * Atomically reads a pointer value, ordered.
3979 *
3980 * @returns Current *pv value
3981 * @param ppv Pointer to the pointer variable to read.
3982 *
3983 * @remarks Please use ASMAtomicReadPtrT, it provides better type safety and
3984 * requires less typing (no casts).
3985 */
3986DECLINLINE(void RT_FAR *) ASMAtomicReadPtr(void RT_FAR * volatile RT_FAR *ppv) RT_NOTHROW_DEF
3987{
3988#if ARCH_BITS == 32 || ARCH_BITS == 16
3989 return (void RT_FAR *)ASMAtomicReadU32((volatile uint32_t RT_FAR *)(void RT_FAR *)ppv);
3990#elif ARCH_BITS == 64
3991 return (void RT_FAR *)ASMAtomicReadU64((volatile uint64_t RT_FAR *)(void RT_FAR *)ppv);
3992#else
3993# error "ARCH_BITS is bogus"
3994#endif
3995}
3996
3997/**
3998 * Convenience macro for avoiding the annoying casting with ASMAtomicReadPtr.
3999 *
4000 * @returns Current *pv value
4001 * @param ppv Pointer to the pointer variable to read.
4002 * @param Type The type of *ppv, sans volatile.
4003 */
4004#ifdef __GNUC__ /* 8.2.0 requires -Wno-ignored-qualifiers */
4005# define ASMAtomicReadPtrT(ppv, Type) \
4006 __extension__ \
4007 ({\
4008 __typeof__(*(ppv)) volatile *ppvTypeChecked = (ppv); \
4009 Type pvTypeChecked = (__typeof__(*(ppv))) ASMAtomicReadPtr((void * volatile *)ppvTypeChecked); \
4010 pvTypeChecked; \
4011 })
4012#else
4013# define ASMAtomicReadPtrT(ppv, Type) \
4014 (Type)ASMAtomicReadPtr((void RT_FAR * volatile RT_FAR *)(ppv))
4015#endif
4016
4017
4018/**
4019 * Atomically reads a pointer value, unordered.
4020 *
4021 * @returns Current *pv value
4022 * @param ppv Pointer to the pointer variable to read.
4023 *
4024 * @remarks Please use ASMAtomicUoReadPtrT, it provides better type safety and
4025 * requires less typing (no casts).
4026 */
4027DECLINLINE(void RT_FAR *) ASMAtomicUoReadPtr(void RT_FAR * volatile RT_FAR *ppv) RT_NOTHROW_DEF
4028{
4029#if ARCH_BITS == 32 || ARCH_BITS == 16
4030 return (void RT_FAR *)ASMAtomicUoReadU32((volatile uint32_t RT_FAR *)(void RT_FAR *)ppv);
4031#elif ARCH_BITS == 64
4032 return (void RT_FAR *)ASMAtomicUoReadU64((volatile uint64_t RT_FAR *)(void RT_FAR *)ppv);
4033#else
4034# error "ARCH_BITS is bogus"
4035#endif
4036}
4037
4038
4039/**
4040 * Convenience macro for avoiding the annoying casting with ASMAtomicUoReadPtr.
4041 *
4042 * @returns Current *pv value
4043 * @param ppv Pointer to the pointer variable to read.
4044 * @param Type The type of *ppv, sans volatile.
4045 */
4046#ifdef __GNUC__ /* 8.2.0 requires -Wno-ignored-qualifiers */
4047# define ASMAtomicUoReadPtrT(ppv, Type) \
4048 __extension__ \
4049 ({\
4050 __typeof__(*(ppv)) volatile * const ppvTypeChecked = (ppv); \
4051 Type pvTypeChecked = (__typeof__(*(ppv))) ASMAtomicUoReadPtr((void * volatile *)ppvTypeChecked); \
4052 pvTypeChecked; \
4053 })
4054#else
4055# define ASMAtomicUoReadPtrT(ppv, Type) \
4056 (Type)ASMAtomicUoReadPtr((void RT_FAR * volatile RT_FAR *)(ppv))
4057#endif
4058
4059
4060/**
4061 * Atomically reads a boolean value, ordered.
4062 *
4063 * @returns Current *pf value
4064 * @param pf Pointer to the boolean variable to read.
4065 */
4066DECLINLINE(bool) ASMAtomicReadBool(volatile bool RT_FAR *pf) RT_NOTHROW_DEF
4067{
4068 ASMMemoryFence();
4069 return *pf; /* byte reads are atomic on x86 */
4070}
4071
4072
4073/**
4074 * Atomically reads a boolean value, unordered.
4075 *
4076 * @returns Current *pf value
4077 * @param pf Pointer to the boolean variable to read.
4078 */
4079DECLINLINE(bool) ASMAtomicUoReadBool(volatile bool RT_FAR *pf) RT_NOTHROW_DEF
4080{
4081 return *pf; /* byte reads are atomic on x86 */
4082}
4083
4084
4085/**
4086 * Atomically read a typical IPRT handle value, ordered.
4087 *
4088 * @param ph Pointer to the handle variable to read.
4089 * @param phRes Where to store the result.
4090 *
4091 * @remarks This doesn't currently work for all handles (like RTFILE).
4092 */
4093#if HC_ARCH_BITS == 32 || ARCH_BITS == 16
4094# define ASMAtomicReadHandle(ph, phRes) \
4095 do { \
4096 AssertCompile(sizeof(*(ph)) == sizeof(uint32_t)); \
4097 AssertCompile(sizeof(*(phRes)) == sizeof(uint32_t)); \
4098 *(uint32_t RT_FAR *)(phRes) = ASMAtomicReadU32((uint32_t volatile RT_FAR *)(ph)); \
4099 } while (0)
4100#elif HC_ARCH_BITS == 64
4101# define ASMAtomicReadHandle(ph, phRes) \
4102 do { \
4103 AssertCompile(sizeof(*(ph)) == sizeof(uint64_t)); \
4104 AssertCompile(sizeof(*(phRes)) == sizeof(uint64_t)); \
4105 *(uint64_t RT_FAR *)(phRes) = ASMAtomicReadU64((uint64_t volatile RT_FAR *)(ph)); \
4106 } while (0)
4107#else
4108# error HC_ARCH_BITS
4109#endif
4110
4111
4112/**
4113 * Atomically read a typical IPRT handle value, unordered.
4114 *
4115 * @param ph Pointer to the handle variable to read.
4116 * @param phRes Where to store the result.
4117 *
4118 * @remarks This doesn't currently work for all handles (like RTFILE).
4119 */
4120#if HC_ARCH_BITS == 32 || ARCH_BITS == 16
4121# define ASMAtomicUoReadHandle(ph, phRes) \
4122 do { \
4123 AssertCompile(sizeof(*(ph)) == sizeof(uint32_t)); \
4124 AssertCompile(sizeof(*(phRes)) == sizeof(uint32_t)); \
4125 *(uint32_t RT_FAR *)(phRes) = ASMAtomicUoReadU32((uint32_t volatile RT_FAR *)(ph)); \
4126 } while (0)
4127#elif HC_ARCH_BITS == 64
4128# define ASMAtomicUoReadHandle(ph, phRes) \
4129 do { \
4130 AssertCompile(sizeof(*(ph)) == sizeof(uint64_t)); \
4131 AssertCompile(sizeof(*(phRes)) == sizeof(uint64_t)); \
4132 *(uint64_t RT_FAR *)(phRes) = ASMAtomicUoReadU64((uint64_t volatile RT_FAR *)(ph)); \
4133 } while (0)
4134#else
4135# error HC_ARCH_BITS
4136#endif
4137
4138
4139/**
4140 * Atomically read a value which size might differ
4141 * between platforms or compilers, ordered.
4142 *
4143 * @param pu Pointer to the variable to read.
4144 * @param puRes Where to store the result.
4145 */
4146#define ASMAtomicReadSize(pu, puRes) \
4147 do { \
4148 switch (sizeof(*(pu))) { \
4149 case 1: *(uint8_t RT_FAR *)(puRes) = ASMAtomicReadU8( (volatile uint8_t RT_FAR *)(void RT_FAR *)(pu)); break; \
4150 case 2: *(uint16_t RT_FAR *)(puRes) = ASMAtomicReadU16((volatile uint16_t RT_FAR *)(void RT_FAR *)(pu)); break; \
4151 case 4: *(uint32_t RT_FAR *)(puRes) = ASMAtomicReadU32((volatile uint32_t RT_FAR *)(void RT_FAR *)(pu)); break; \
4152 case 8: *(uint64_t RT_FAR *)(puRes) = ASMAtomicReadU64((volatile uint64_t RT_FAR *)(void RT_FAR *)(pu)); break; \
4153 default: AssertMsgFailed(("ASMAtomicReadSize: size %d is not supported\n", sizeof(*(pu)))); \
4154 } \
4155 } while (0)
4156
4157
4158/**
4159 * Atomically read a value which size might differ
4160 * between platforms or compilers, unordered.
4161 *
4162 * @param pu Pointer to the variable to read.
4163 * @param puRes Where to store the result.
4164 */
4165#define ASMAtomicUoReadSize(pu, puRes) \
4166 do { \
4167 switch (sizeof(*(pu))) { \
4168 case 1: *(uint8_t RT_FAR *)(puRes) = ASMAtomicUoReadU8( (volatile uint8_t RT_FAR *)(void RT_FAR *)(pu)); break; \
4169 case 2: *(uint16_t RT_FAR *)(puRes) = ASMAtomicUoReadU16((volatile uint16_t RT_FAR *)(void RT_FAR *)(pu)); break; \
4170 case 4: *(uint32_t RT_FAR *)(puRes) = ASMAtomicUoReadU32((volatile uint32_t RT_FAR *)(void RT_FAR *)(pu)); break; \
4171 case 8: *(uint64_t RT_FAR *)(puRes) = ASMAtomicUoReadU64((volatile uint64_t RT_FAR *)(void RT_FAR *)(pu)); break; \
4172 default: AssertMsgFailed(("ASMAtomicReadSize: size %d is not supported\n", sizeof(*(pu)))); \
4173 } \
4174 } while (0)
4175
4176
4177/**
4178 * Atomically writes an unsigned 8-bit value, ordered.
4179 *
4180 * @param pu8 Pointer to the 8-bit variable.
4181 * @param u8 The 8-bit value to assign to *pu8.
4182 */
4183DECLINLINE(void) ASMAtomicWriteU8(volatile uint8_t RT_FAR *pu8, uint8_t u8) RT_NOTHROW_DEF
4184{
4185#if defined(RT_ARCH_ARM64)
4186 /* The DMB SY will ensure ordering a la x86, the stlrb is probably overkill
4187 as all byte accesses are single-copy atomic, which I think suffices here. */
4188# if RT_INLINE_ASM_USES_INTRIN
4189 __dmb(_ARM64_BARRIER_SY);
4190 __stlr8(pu8, u8);
4191# else
4192 __asm__ __volatile__("Lstart_ASMAtomicWriteU8_%=:\n\t"
4193# if defined(RTASM_ARM64_USE_FEAT_LSE) && 0 /* this is a lot slower and has no alignment benefits with LSE2 */
4194 RTASM_ARM_DMB_SY
4195 "swpb %w[uValue], wzr, %[pMem]\n\t"
4196# else
4197 RTASM_ARM_DMB_SY
4198 "stlrb %w[uValue], %[pMem]\n\t" /* single-copy atomic w/ release semantics. */
4199# endif
4200 : [pMem] "+Q" (*pu8)
4201 : [uValue] "r" ((uint32_t)u8)
4202 : );
4203# endif
4204
4205#else
4206 ASMAtomicXchgU8(pu8, u8);
4207#endif
4208}
4209
4210
4211/**
4212 * Atomically writes an unsigned 8-bit value, unordered.
4213 *
4214 * @param pu8 Pointer to the 8-bit variable.
4215 * @param u8 The 8-bit value to assign to *pu8.
4216 */
4217DECLINLINE(void) ASMAtomicUoWriteU8(volatile uint8_t RT_FAR *pu8, uint8_t u8) RT_NOTHROW_DEF
4218{
4219 *pu8 = u8; /* byte writes are atomic on x86 */
4220}
4221
4222
4223/**
4224 * Atomically writes a signed 8-bit value, ordered.
4225 *
4226 * @param pi8 Pointer to the 8-bit variable to read.
4227 * @param i8 The 8-bit value to assign to *pi8.
4228 */
4229DECLINLINE(void) ASMAtomicWriteS8(volatile int8_t RT_FAR *pi8, int8_t i8) RT_NOTHROW_DEF
4230{
4231#if defined(RT_ARCH_ARM64)
4232 ASMAtomicWriteU8((volatile uint8_t RT_FAR *)pi8, (uint8_t)i8);
4233#else
4234 ASMAtomicXchgS8(pi8, i8);
4235#endif
4236}
4237
4238
4239/**
4240 * Atomically writes a signed 8-bit value, unordered.
4241 *
4242 * @param pi8 Pointer to the 8-bit variable to write.
4243 * @param i8 The 8-bit value to assign to *pi8.
4244 */
4245DECLINLINE(void) ASMAtomicUoWriteS8(volatile int8_t RT_FAR *pi8, int8_t i8) RT_NOTHROW_DEF
4246{
4247 *pi8 = i8; /* byte writes are atomic on x86 */
4248}
4249
4250
4251/**
4252 * Atomically writes an unsigned 16-bit value, ordered.
4253 *
4254 * @param pu16 Pointer to the 16-bit variable to write.
4255 * @param u16 The 16-bit value to assign to *pu16.
4256 */
4257DECLINLINE(void) ASMAtomicWriteU16(volatile uint16_t RT_FAR *pu16, uint16_t u16) RT_NOTHROW_DEF
4258{
4259#if defined(RT_ARCH_ARM64)
4260 /* See ASMAtomicWriteU8 comments. */
4261# if RT_INLINE_ASM_USES_INTRIN
4262 __dmb(_ARM64_BARRIER_SY);
4263 __stlr16(pu16, u16);
4264# else
4265 __asm__ __volatile__("Lstart_ASMAtomicWriteU16_%=:\n\t"
4266# if defined(RTASM_ARM64_USE_FEAT_LSE) /* slower on M1, but benefits from relaxed LSE2 alignment requirements (M2?). */
4267 RTASM_ARM_DMB_SY
4268 "swph %w[uValue], wzr, %[pMem]\n\t"
4269# else
4270 RTASM_ARM_DMB_SY
4271 "stlrh %w[uValue], %[pMem]\n\t" /* single-copy atomic w/ release semantics. */
4272# endif
4273 : [pMem] "+Q" (*pu16)
4274 : [uValue] "r" ((uint32_t)u16)
4275 : );
4276# endif
4277
4278#else
4279 ASMAtomicXchgU16(pu16, u16);
4280#endif
4281}
4282
4283
4284/**
4285 * Atomically writes an unsigned 16-bit value, unordered.
4286 *
4287 * @param pu16 Pointer to the 16-bit variable to write.
4288 * @param u16 The 16-bit value to assign to *pu16.
4289 */
4290DECLINLINE(void) ASMAtomicUoWriteU16(volatile uint16_t RT_FAR *pu16, uint16_t u16) RT_NOTHROW_DEF
4291{
4292 Assert(!((uintptr_t)pu16 & 1));
4293 *pu16 = u16;
4294}
4295
4296
4297/**
4298 * Atomically writes a signed 16-bit value, ordered.
4299 *
4300 * @param pi16 Pointer to the 16-bit variable to write.
4301 * @param i16 The 16-bit value to assign to *pi16.
4302 */
4303DECLINLINE(void) ASMAtomicWriteS16(volatile int16_t RT_FAR *pi16, int16_t i16) RT_NOTHROW_DEF
4304{
4305#if defined(RT_ARCH_ARM64)
4306 ASMAtomicWriteU16((volatile uint16_t RT_FAR *)pi16, (uint16_t)i16);
4307#else
4308 ASMAtomicXchgS16(pi16, i16);
4309#endif
4310}
4311
4312
4313/**
4314 * Atomically writes a signed 16-bit value, unordered.
4315 *
4316 * @param pi16 Pointer to the 16-bit variable to write.
4317 * @param i16 The 16-bit value to assign to *pi16.
4318 */
4319DECLINLINE(void) ASMAtomicUoWriteS16(volatile int16_t RT_FAR *pi16, int16_t i16) RT_NOTHROW_DEF
4320{
4321 Assert(!((uintptr_t)pi16 & 1));
4322 *pi16 = i16;
4323}
4324
4325
4326/**
4327 * Atomically writes an unsigned 32-bit value, ordered.
4328 *
4329 * @param pu32 Pointer to the 32-bit variable to write.
4330 * @param u32 The 32-bit value to assign to *pu32.
4331 */
4332DECLINLINE(void) ASMAtomicWriteU32(volatile uint32_t RT_FAR *pu32, uint32_t u32) RT_NOTHROW_DEF
4333{
4334#if defined(RT_ARCH_ARM64)
4335 /* See ASMAtomicWriteU8 comments. */
4336# if RT_INLINE_ASM_USES_INTRIN
4337 __dmb(_ARM64_BARRIER_SY);
4338 __stlr32(pu32, u32);
4339# else
4340 __asm__ __volatile__("Lstart_ASMAtomicWriteU32_%=:\n\t"
4341# if defined(RTASM_ARM64_USE_FEAT_LSE) /* slower on M1, but benefits from relaxed LSE2 alignment requirements (M2?). */
4342 RTASM_ARM_DMB_SY
4343 "swp %w[uValue], wzr, %[pMem]\n\t"
4344# else
4345 RTASM_ARM_DMB_SY
4346 "stlr %w[uValue], %[pMem]\n\t" /* single-copy atomic w/ release semantics. */
4347# endif
4348 : [pMem] "+Q" (*pu32)
4349 : [uValue] "r" (u32)
4350 : "cc");
4351# endif
4352
4353#else
4354 ASMAtomicXchgU32(pu32, u32);
4355#endif
4356}
4357
4358
4359/**
4360 * Atomically writes an unsigned 32-bit value, unordered.
4361 *
4362 * @param pu32 Pointer to the 32-bit variable to write.
4363 * @param u32 The 32-bit value to assign to *pu32.
4364 */
4365DECLINLINE(void) ASMAtomicUoWriteU32(volatile uint32_t RT_FAR *pu32, uint32_t u32) RT_NOTHROW_DEF
4366{
4367 Assert(!((uintptr_t)pu32 & 3));
4368#if ARCH_BITS >= 32
4369 *pu32 = u32;
4370#else
4371 ASMAtomicXchgU32(pu32, u32);
4372#endif
4373}
4374
4375
4376/**
4377 * Atomically writes a signed 32-bit value, ordered.
4378 *
4379 * @param pi32 Pointer to the 32-bit variable to write.
4380 * @param i32 The 32-bit value to assign to *pi32.
4381 */
4382DECLINLINE(void) ASMAtomicWriteS32(volatile int32_t RT_FAR *pi32, int32_t i32) RT_NOTHROW_DEF
4383{
4384#if defined(RT_ARCH_ARM64)
4385 ASMAtomicWriteU32((volatile uint32_t RT_FAR *)pi32, (uint32_t)i32);
4386#else
4387 ASMAtomicXchgS32(pi32, i32);
4388#endif
4389}
4390
4391
4392/**
4393 * Atomically writes a signed 32-bit value, unordered.
4394 *
4395 * @param pi32 Pointer to the 32-bit variable to write.
4396 * @param i32 The 32-bit value to assign to *pi32.
4397 */
4398DECLINLINE(void) ASMAtomicUoWriteS32(volatile int32_t RT_FAR *pi32, int32_t i32) RT_NOTHROW_DEF
4399{
4400 Assert(!((uintptr_t)pi32 & 3));
4401#if ARCH_BITS >= 32
4402 *pi32 = i32;
4403#else
4404 ASMAtomicXchgS32(pi32, i32);
4405#endif
4406}
4407
4408
4409/**
4410 * Atomically writes an unsigned 64-bit value, ordered.
4411 *
4412 * @param pu64 Pointer to the 64-bit variable to write.
4413 * @param u64 The 64-bit value to assign to *pu64.
4414 */
4415DECLINLINE(void) ASMAtomicWriteU64(volatile uint64_t RT_FAR *pu64, uint64_t u64) RT_NOTHROW_DEF
4416{
4417#if defined(RT_ARCH_ARM64)
4418 /* See ASMAtomicWriteU8 comments. */
4419# if RT_INLINE_ASM_USES_INTRIN
4420 __dmb(_ARM64_BARRIER_SY);
4421 __stlr64(pu64, u64);
4422# else
4423 __asm__ __volatile__("Lstart_ASMAtomicWriteU64_%=:\n\t"
4424# if defined(RTASM_ARM64_USE_FEAT_LSE) /* slower on M1, but benefits from relaxed LSE2 alignment requirements (M2?). */
4425 RTASM_ARM_DMB_SY
4426 "swp %[uValue], xzr, %[pMem]\n\t"
4427# else
4428 RTASM_ARM_DMB_SY /** @todo necessary? */
4429 "stlr %[uValue], %[pMem]\n\t"
4430# endif
4431 : [pMem] "+Q" (*pu64)
4432 : [uValue] "r" (u64)
4433 : );
4434# endif
4435
4436#else
4437 ASMAtomicXchgU64(pu64, u64);
4438#endif
4439}
4440
4441
4442/**
4443 * Atomically writes an unsigned 64-bit value, unordered.
4444 *
4445 * @param pu64 Pointer to the 64-bit variable to write.
4446 * @param u64 The 64-bit value to assign to *pu64.
4447 */
4448DECLINLINE(void) ASMAtomicUoWriteU64(volatile uint64_t RT_FAR *pu64, uint64_t u64) RT_NOTHROW_DEF
4449{
4450 Assert(!((uintptr_t)pu64 & 7));
4451#if ARCH_BITS == 64
4452 *pu64 = u64;
4453#else
4454 ASMAtomicXchgU64(pu64, u64);
4455#endif
4456}
4457
4458
4459/**
4460 * Atomically writes a signed 64-bit value, ordered.
4461 *
4462 * @param pi64 Pointer to the 64-bit variable to write.
4463 * @param i64 The 64-bit value to assign to *pi64.
4464 */
4465DECLINLINE(void) ASMAtomicWriteS64(volatile int64_t RT_FAR *pi64, int64_t i64) RT_NOTHROW_DEF
4466{
4467#if defined(RT_ARCH_ARM64)
4468 ASMAtomicWriteU64((volatile uint64_t RT_FAR *)pi64, (uint64_t)i64);
4469#else
4470 ASMAtomicXchgS64(pi64, i64);
4471#endif
4472}
4473
4474
4475/**
4476 * Atomically writes a signed 64-bit value, unordered.
4477 *
4478 * @param pi64 Pointer to the 64-bit variable to write.
4479 * @param i64 The 64-bit value to assign to *pi64.
4480 */
4481DECLINLINE(void) ASMAtomicUoWriteS64(volatile int64_t RT_FAR *pi64, int64_t i64) RT_NOTHROW_DEF
4482{
4483 Assert(!((uintptr_t)pi64 & 7));
4484#if ARCH_BITS == 64
4485 *pi64 = i64;
4486#else
4487 ASMAtomicXchgS64(pi64, i64);
4488#endif
4489}
4490
4491
4492/** @def RTASM_HAVE_WRITE_U128
4493 * Defined in the target architecture supports atomic of 128-bit integers.
4494 *
4495 * The define value is zero if both ordered and unordered writes are implemented
4496 * using ASMAtomicCmpXchgU128v2(). It is 1 if unordered writes are done
4497 * natively w/o cmpxchg and 3 if both variants are done natively w/o cmpxchg.
4498 *
4499 * @note AMD64: Caller must check for cmpxchg16b support before use.
4500 * @sa RTASM_HAVE_CMP_XCHG_U128
4501 */
4502#if defined(RT_ARCH_ARM64) || defined(DOXYGEN_RUNNING)
4503# define RTASM_HAVE_WRITE_U128 3
4504#elif defined(RTASM_HAVE_CMP_XCHG_U128)
4505# define RTASM_HAVE_WRITE_U128 0
4506#endif
4507
4508#ifdef RTASM_HAVE_WRITE_U128
4509
4510/**
4511 * Atomically writes an unsigned 128-bit value, ordered.
4512 *
4513 * @param pu128 Pointer to the variable to overwrite. Must be aligned
4514 * on 16 byte boundrary.
4515 * @param u64Hi The high 64 bits of the new value.
4516 * @param u64Lo The low 64 bits of the new value.
4517 */
4518DECLINLINE(void) ASMAtomicWriteU128v2(volatile uint128_t *pu128, const uint64_t u64Hi, const uint64_t u64Lo) RT_NOTHROW_DEF
4519{
4520# if !defined(__GNUC__) || !defined(RT_ARCH_ARM64)
4521 RTUINT128U u128Old;
4522# endif
4523 Assert(!((uintptr_t)pu128 & 15));
4524# if defined(__GNUC__) && defined(RT_ARCH_ARM64)
4525 __asm__ __volatile__("Lstart_ASMAtomicWriteU128v2_%=:\n\t"
4526# if 0 && defined(RTASM_ARM64_USE_FEAT_LSE128) /** @todo hw support? test + debug */
4527 RTASM_ARM_DMB_SY
4528 "swpp %[uValueLo], %[uValueHi], %[pMem]\n\t"
4529# else
4530 RTASM_ARM_DMB_SY
4531 "stp %[uValueLo], %[uValueHi], %[pMem]\n\t"
4532 "dmb sy\n\t"
4533# endif
4534 : [pMem] "+Q" (*pu128)
4535 : [uValueHi] "r" (u64Hi)
4536 , [uValueLo] "r" (u64Lo)
4537 : );
4538
4539# else
4540# ifdef RT_COMPILER_WITH_128BIT_INT_TYPES
4541 u128Old.u = *pu128;
4542# else
4543 u128Old.u.Lo = pu128->Lo;
4544 u128Old.u.Hi = pu128->Hi;
4545# endif
4546 while (!ASMAtomicCmpXchgU128v2(pu128, u64Hi, u64Lo, u128Old.s.Hi, u128Old.s.Lo, &u128Old.u))
4547 { }
4548# endif
4549}
4550
4551
4552/**
4553 * Atomically writes an unsigned 128-bit value, ordered.
4554 *
4555 * @param pu128 Pointer to the variable to overwrite. Must be aligned
4556 * on 16 byte boundrary.
4557 * @param u64Hi The high 64 bits of the new value.
4558 * @param u64Lo The low 64 bits of the new value.
4559 * @note This is ordered on AMD64.
4560 */
4561DECLINLINE(void) ASMAtomicUoWriteU128v2(volatile uint128_t *pu128, const uint64_t u64Hi, const uint64_t u64Lo) RT_NOTHROW_DEF
4562{
4563# if !defined(__GNUC__) || !defined(RT_ARCH_ARM64)
4564 RTUINT128U u128Old;
4565# endif
4566 Assert(!((uintptr_t)pu128 & 15));
4567# if defined(__GNUC__) && defined(RT_ARCH_ARM64)
4568 __asm__ __volatile__("Lstart_ASMAtomicUoWriteU128v2_%=:\n\t"
4569 "stp %[uValueLo], %[uValueHi], %[pMem]\n\t"
4570 : [pMem] "+Q" (*pu128)
4571 : [uValueHi] "r" (u64Hi)
4572 , [uValueLo] "r" (u64Lo)
4573 : );
4574
4575# else
4576# ifdef RT_COMPILER_WITH_128BIT_INT_TYPES
4577 u128Old.u = *pu128;
4578# else
4579 u128Old.u.Lo = pu128->Lo;
4580 u128Old.u.Hi = pu128->Hi;
4581# endif
4582 while (!ASMAtomicCmpXchgU128v2(pu128, u64Hi, u64Lo, u128Old.s.Hi, u128Old.s.Lo, &u128Old.u))
4583 { }
4584# endif
4585}
4586
4587
4588/**
4589 * Atomically writes an unsigned 128-bit value, ordered.
4590 *
4591 * @param pu128 Pointer to the variable to overwrite. Must be aligned
4592 * on 16 byte boundrary.
4593 * @param u128 The the new value.
4594 */
4595DECLINLINE(void) ASMAtomicWriteU128(volatile uint128_t *pu128, const uint128_t u128) RT_NOTHROW_DEF
4596{
4597# ifdef RT_COMPILER_WITH_128BIT_INT_TYPES
4598 ASMAtomicWriteU128v2(pu128, (uint64_t)(u128 >> 64), (uint64_t)u128);
4599# else
4600 ASMAtomicWriteU128v2(pu128, u128.Hi, u128.Lo);
4601# endif
4602}
4603
4604
4605/**
4606 * Atomically writes an unsigned 128-bit value, unordered.
4607 *
4608 * @param pu128 Pointer to the variable to overwrite. Must be aligned
4609 * on 16 byte boundrary.
4610 * @param u128 The the new value.
4611 * @note This is ordered on AMD64.
4612 */
4613DECLINLINE(void) ASMAtomicUoWriteU128(volatile uint128_t *pu128, const uint128_t u128) RT_NOTHROW_DEF
4614{
4615# ifdef RT_COMPILER_WITH_128BIT_INT_TYPES
4616 ASMAtomicUoWriteU128v2(pu128, (uint64_t)(u128 >> 64), (uint64_t)u128);
4617# else
4618 ASMAtomicUoWriteU128v2(pu128, u128.Hi, u128.Lo);
4619# endif
4620}
4621
4622
4623/**
4624 * Atomically writes an unsigned 128-bit value, ordered.
4625 *
4626 * @param pu128 Pointer to the variable to overwrite. Must be aligned
4627 * on 16 byte boundrary.
4628 * @param u128 The the new value.
4629 */
4630DECLINLINE(void) ASMAtomicWriteU128U(volatile RTUINT128U *pu128, const RTUINT128U u128) RT_NOTHROW_DEF
4631{
4632 ASMAtomicWriteU128v2(&pu128->u, u128.s.Hi, u128.s.Lo);
4633}
4634
4635
4636/**
4637 * Atomically writes an unsigned 128-bit value, unordered.
4638 *
4639 * @param pu128 Pointer to the variable to overwrite. Must be aligned
4640 * on 16 byte boundrary.
4641 * @param u128 The the new value.
4642 * @note This is ordered on AMD64.
4643 */
4644DECLINLINE(void) ASMAtomicUoWriteU128U(volatile RTUINT128U *pu128, const RTUINT128U u128) RT_NOTHROW_DEF
4645{
4646 ASMAtomicUoWriteU128v2(&pu128->u, u128.s.Hi, u128.s.Lo);
4647}
4648
4649#endif /* RTASM_HAVE_WRITE_U128 */
4650
4651/**
4652 * Atomically writes a size_t value, ordered.
4653 *
4654 * @param pcb Pointer to the size_t variable to write.
4655 * @param cb The value to assign to *pcb.
4656 */
4657DECLINLINE(void) ASMAtomicWriteZ(volatile size_t RT_FAR *pcb, size_t cb) RT_NOTHROW_DEF
4658{
4659#if ARCH_BITS == 64
4660 ASMAtomicWriteU64((uint64_t volatile *)pcb, cb);
4661#elif ARCH_BITS == 32
4662 ASMAtomicWriteU32((uint32_t volatile *)pcb, cb);
4663#elif ARCH_BITS == 16
4664 AssertCompileSize(size_t, 2);
4665 ASMAtomicWriteU16((uint16_t volatile *)pcb, cb);
4666#else
4667# error "Unsupported ARCH_BITS value"
4668#endif
4669}
4670
4671
4672/**
4673 * Atomically writes a size_t value, unordered.
4674 *
4675 * @param pcb Pointer to the size_t variable to write.
4676 * @param cb The value to assign to *pcb.
4677 */
4678DECLINLINE(void) ASMAtomicUoWriteZ(volatile size_t RT_FAR *pcb, size_t cb) RT_NOTHROW_DEF
4679{
4680#if ARCH_BITS == 64
4681 ASMAtomicUoWriteU64((uint64_t volatile *)pcb, cb);
4682#elif ARCH_BITS == 32
4683 ASMAtomicUoWriteU32((uint32_t volatile *)pcb, cb);
4684#elif ARCH_BITS == 16
4685 AssertCompileSize(size_t, 2);
4686 ASMAtomicUoWriteU16((uint16_t volatile *)pcb, cb);
4687#else
4688# error "Unsupported ARCH_BITS value"
4689#endif
4690}
4691
4692
4693/**
4694 * Atomically writes a boolean value, unordered.
4695 *
4696 * @param pf Pointer to the boolean variable to write.
4697 * @param f The boolean value to assign to *pf.
4698 */
4699DECLINLINE(void) ASMAtomicWriteBool(volatile bool RT_FAR *pf, bool f) RT_NOTHROW_DEF
4700{
4701 ASMAtomicWriteU8((uint8_t volatile RT_FAR *)pf, f);
4702}
4703
4704
4705/**
4706 * Atomically writes a boolean value, unordered.
4707 *
4708 * @param pf Pointer to the boolean variable to write.
4709 * @param f The boolean value to assign to *pf.
4710 */
4711DECLINLINE(void) ASMAtomicUoWriteBool(volatile bool RT_FAR *pf, bool f) RT_NOTHROW_DEF
4712{
4713 *pf = f; /* byte writes are atomic on x86 */
4714}
4715
4716
4717/**
4718 * Atomically writes a pointer value, ordered.
4719 *
4720 * @param ppv Pointer to the pointer variable to write.
4721 * @param pv The pointer value to assign to *ppv.
4722 */
4723DECLINLINE(void) ASMAtomicWritePtrVoid(void RT_FAR * volatile RT_FAR *ppv, const void *pv) RT_NOTHROW_DEF
4724{
4725#if ARCH_BITS == 32 || ARCH_BITS == 16
4726 ASMAtomicWriteU32((volatile uint32_t RT_FAR *)(void RT_FAR *)ppv, (uint32_t)pv);
4727#elif ARCH_BITS == 64
4728 ASMAtomicWriteU64((volatile uint64_t RT_FAR *)(void RT_FAR *)ppv, (uint64_t)pv);
4729#else
4730# error "ARCH_BITS is bogus"
4731#endif
4732}
4733
4734
4735/**
4736 * Atomically writes a pointer value, unordered.
4737 *
4738 * @param ppv Pointer to the pointer variable to write.
4739 * @param pv The pointer value to assign to *ppv.
4740 */
4741DECLINLINE(void) ASMAtomicUoWritePtrVoid(void RT_FAR * volatile RT_FAR *ppv, const void *pv) RT_NOTHROW_DEF
4742{
4743#if ARCH_BITS == 32 || ARCH_BITS == 16
4744 ASMAtomicUoWriteU32((volatile uint32_t RT_FAR *)(void RT_FAR *)ppv, (uint32_t)pv);
4745#elif ARCH_BITS == 64
4746 ASMAtomicUoWriteU64((volatile uint64_t RT_FAR *)(void RT_FAR *)ppv, (uint64_t)pv);
4747#else
4748# error "ARCH_BITS is bogus"
4749#endif
4750}
4751
4752
4753/**
4754 * Atomically writes a pointer value, ordered.
4755 *
4756 * @param ppv Pointer to the pointer variable to write.
4757 * @param pv The pointer value to assign to *ppv. If NULL use
4758 * ASMAtomicWriteNullPtr or you'll land in trouble.
4759 *
4760 * @remarks This is relatively type safe on GCC platforms when @a pv isn't
4761 * NULL.
4762 */
4763#ifdef __GNUC__
4764# define ASMAtomicWritePtr(ppv, pv) \
4765 do \
4766 { \
4767 __typeof__(*(ppv)) volatile RT_FAR * const ppvTypeChecked = (ppv); \
4768 __typeof__(*(ppv)) const pvTypeChecked = (pv); \
4769 \
4770 AssertCompile(sizeof(*ppv) == sizeof(void RT_FAR *)); \
4771 AssertCompile(sizeof(pv) == sizeof(void RT_FAR *)); \
4772 Assert(!( (uintptr_t)ppv & ((ARCH_BITS / 8) - 1) )); \
4773 \
4774 ASMAtomicWritePtrVoid((void RT_FAR * volatile RT_FAR *)(ppvTypeChecked), (void RT_FAR *)(pvTypeChecked)); \
4775 } while (0)
4776#else
4777# define ASMAtomicWritePtr(ppv, pv) \
4778 do \
4779 { \
4780 AssertCompile(sizeof(*ppv) == sizeof(void RT_FAR *)); \
4781 AssertCompile(sizeof(pv) == sizeof(void RT_FAR *)); \
4782 Assert(!( (uintptr_t)ppv & ((ARCH_BITS / 8) - 1) )); \
4783 \
4784 ASMAtomicWritePtrVoid((void RT_FAR * volatile RT_FAR *)(ppv), (void RT_FAR *)(pv)); \
4785 } while (0)
4786#endif
4787
4788
4789/**
4790 * Atomically sets a pointer to NULL, ordered.
4791 *
4792 * @param ppv Pointer to the pointer variable that should be set to NULL.
4793 *
4794 * @remarks This is relatively type safe on GCC platforms.
4795 */
4796#if RT_GNUC_PREREQ(4, 2)
4797# define ASMAtomicWriteNullPtr(ppv) \
4798 do \
4799 { \
4800 __typeof__(*(ppv)) * const ppvTypeChecked = (ppv); \
4801 AssertCompile(sizeof(*ppv) == sizeof(void RT_FAR *)); \
4802 Assert(!( (uintptr_t)ppv & ((ARCH_BITS / 8) - 1) )); \
4803 ASMAtomicWritePtrVoid((void RT_FAR * volatile RT_FAR *)(ppvTypeChecked), NULL); \
4804 } while (0)
4805#else
4806# define ASMAtomicWriteNullPtr(ppv) \
4807 do \
4808 { \
4809 AssertCompile(sizeof(*ppv) == sizeof(void RT_FAR *)); \
4810 Assert(!( (uintptr_t)ppv & ((ARCH_BITS / 8) - 1) )); \
4811 ASMAtomicWritePtrVoid((void RT_FAR * volatile RT_FAR *)(ppv), NULL); \
4812 } while (0)
4813#endif
4814
4815
4816/**
4817 * Atomically writes a pointer value, unordered.
4818 *
4819 * @returns Current *pv value
4820 * @param ppv Pointer to the pointer variable.
4821 * @param pv The pointer value to assign to *ppv. If NULL use
4822 * ASMAtomicUoWriteNullPtr or you'll land in trouble.
4823 *
4824 * @remarks This is relatively type safe on GCC platforms when @a pv isn't
4825 * NULL.
4826 */
4827#if RT_GNUC_PREREQ(4, 2)
4828# define ASMAtomicUoWritePtr(ppv, pv) \
4829 do \
4830 { \
4831 __typeof__(*(ppv)) volatile * const ppvTypeChecked = (ppv); \
4832 __typeof__(*(ppv)) const pvTypeChecked = (pv); \
4833 \
4834 AssertCompile(sizeof(*ppv) == sizeof(void *)); \
4835 AssertCompile(sizeof(pv) == sizeof(void *)); \
4836 Assert(!( (uintptr_t)ppv & ((ARCH_BITS / 8) - 1) )); \
4837 \
4838 *(ppvTypeChecked) = pvTypeChecked; \
4839 } while (0)
4840#else
4841# define ASMAtomicUoWritePtr(ppv, pv) \
4842 do \
4843 { \
4844 AssertCompile(sizeof(*ppv) == sizeof(void RT_FAR *)); \
4845 AssertCompile(sizeof(pv) == sizeof(void RT_FAR *)); \
4846 Assert(!( (uintptr_t)ppv & ((ARCH_BITS / 8) - 1) )); \
4847 *(ppv) = pv; \
4848 } while (0)
4849#endif
4850
4851
4852/**
4853 * Atomically sets a pointer to NULL, unordered.
4854 *
4855 * @param ppv Pointer to the pointer variable that should be set to NULL.
4856 *
4857 * @remarks This is relatively type safe on GCC platforms.
4858 */
4859#ifdef __GNUC__
4860# define ASMAtomicUoWriteNullPtr(ppv) \
4861 do \
4862 { \
4863 __typeof__(*(ppv)) volatile * const ppvTypeChecked = (ppv); \
4864 AssertCompile(sizeof(*ppv) == sizeof(void *)); \
4865 Assert(!( (uintptr_t)ppv & ((ARCH_BITS / 8) - 1) )); \
4866 *(ppvTypeChecked) = NULL; \
4867 } while (0)
4868#else
4869# define ASMAtomicUoWriteNullPtr(ppv) \
4870 do \
4871 { \
4872 AssertCompile(sizeof(*ppv) == sizeof(void RT_FAR *)); \
4873 Assert(!( (uintptr_t)ppv & ((ARCH_BITS / 8) - 1) )); \
4874 *(ppv) = NULL; \
4875 } while (0)
4876#endif
4877
4878
4879/**
4880 * Atomically write a typical IPRT handle value, ordered.
4881 *
4882 * @param ph Pointer to the variable to update.
4883 * @param hNew The value to assign to *ph.
4884 *
4885 * @remarks This doesn't currently work for all handles (like RTFILE).
4886 */
4887#if HC_ARCH_BITS == 32 || ARCH_BITS == 16
4888# define ASMAtomicWriteHandle(ph, hNew) \
4889 do { \
4890 AssertCompile(sizeof(*(ph)) == sizeof(uint32_t)); \
4891 ASMAtomicWriteU32((uint32_t volatile RT_FAR *)(ph), (const uint32_t)(hNew)); \
4892 } while (0)
4893#elif HC_ARCH_BITS == 64
4894# define ASMAtomicWriteHandle(ph, hNew) \
4895 do { \
4896 AssertCompile(sizeof(*(ph)) == sizeof(uint64_t)); \
4897 ASMAtomicWriteU64((uint64_t volatile RT_FAR *)(ph), (const uint64_t)(hNew)); \
4898 } while (0)
4899#else
4900# error HC_ARCH_BITS
4901#endif
4902
4903
4904/**
4905 * Atomically write a typical IPRT handle value, unordered.
4906 *
4907 * @param ph Pointer to the variable to update.
4908 * @param hNew The value to assign to *ph.
4909 *
4910 * @remarks This doesn't currently work for all handles (like RTFILE).
4911 */
4912#if HC_ARCH_BITS == 32 || ARCH_BITS == 16
4913# define ASMAtomicUoWriteHandle(ph, hNew) \
4914 do { \
4915 AssertCompile(sizeof(*(ph)) == sizeof(uint32_t)); \
4916 ASMAtomicUoWriteU32((uint32_t volatile RT_FAR *)(ph), (const uint32_t)hNew); \
4917 } while (0)
4918#elif HC_ARCH_BITS == 64
4919# define ASMAtomicUoWriteHandle(ph, hNew) \
4920 do { \
4921 AssertCompile(sizeof(*(ph)) == sizeof(uint64_t)); \
4922 ASMAtomicUoWriteU64((uint64_t volatile RT_FAR *)(ph), (const uint64_t)hNew); \
4923 } while (0)
4924#else
4925# error HC_ARCH_BITS
4926#endif
4927
4928
4929/**
4930 * Atomically write a value which size might differ
4931 * between platforms or compilers, ordered.
4932 *
4933 * @param pu Pointer to the variable to update.
4934 * @param uNew The value to assign to *pu.
4935 */
4936#define ASMAtomicWriteSize(pu, uNew) \
4937 do { \
4938 switch (sizeof(*(pu))) { \
4939 case 1: ASMAtomicWriteU8( (volatile uint8_t RT_FAR *)(void RT_FAR *)(pu), (uint8_t )(uNew)); break; \
4940 case 2: ASMAtomicWriteU16((volatile uint16_t RT_FAR *)(void RT_FAR *)(pu), (uint16_t)(uNew)); break; \
4941 case 4: ASMAtomicWriteU32((volatile uint32_t RT_FAR *)(void RT_FAR *)(pu), (uint32_t)(uNew)); break; \
4942 case 8: ASMAtomicWriteU64((volatile uint64_t RT_FAR *)(void RT_FAR *)(pu), (uint64_t)(uNew)); break; \
4943 default: AssertMsgFailed(("ASMAtomicWriteSize: size %d is not supported\n", sizeof(*(pu)))); \
4944 } \
4945 } while (0)
4946
4947/**
4948 * Atomically write a value which size might differ
4949 * between platforms or compilers, unordered.
4950 *
4951 * @param pu Pointer to the variable to update.
4952 * @param uNew The value to assign to *pu.
4953 */
4954#define ASMAtomicUoWriteSize(pu, uNew) \
4955 do { \
4956 switch (sizeof(*(pu))) { \
4957 case 1: ASMAtomicUoWriteU8( (volatile uint8_t RT_FAR *)(void RT_FAR *)(pu), (uint8_t )(uNew)); break; \
4958 case 2: ASMAtomicUoWriteU16((volatile uint16_t RT_FAR *)(void RT_FAR *)(pu), (uint16_t)(uNew)); break; \
4959 case 4: ASMAtomicUoWriteU32((volatile uint32_t RT_FAR *)(void RT_FAR *)(pu), (uint32_t)(uNew)); break; \
4960 case 8: ASMAtomicUoWriteU64((volatile uint64_t RT_FAR *)(void RT_FAR *)(pu), (uint64_t)(uNew)); break; \
4961 default: AssertMsgFailed(("ASMAtomicWriteSize: size %d is not supported\n", sizeof(*(pu)))); \
4962 } \
4963 } while (0)
4964
4965
4966
4967/**
4968 * Atomically exchanges and adds to a 16-bit value, ordered.
4969 *
4970 * @returns The old value.
4971 * @param pu16 Pointer to the value.
4972 * @param u16 Number to add.
4973 *
4974 * @remarks Currently not implemented, just to make 16-bit code happy.
4975 * @remarks x86: Requires a 486 or later.
4976 */
4977RT_ASM_DECL_PRAGMA_WATCOM(uint16_t) ASMAtomicAddU16(uint16_t volatile RT_FAR *pu16, uint32_t u16) RT_NOTHROW_PROTO;
4978
4979
4980/**
4981 * Atomically exchanges and adds to a 32-bit value, ordered.
4982 *
4983 * @returns The old value.
4984 * @param pu32 Pointer to the value.
4985 * @param u32 Number to add.
4986 *
4987 * @remarks x86: Requires a 486 or later.
4988 */
4989#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
4990RT_ASM_DECL_PRAGMA_WATCOM(uint32_t) ASMAtomicAddU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_PROTO;
4991#else
4992DECLINLINE(uint32_t) ASMAtomicAddU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_DEF
4993{
4994# if RT_INLINE_ASM_USES_INTRIN
4995 u32 = _InterlockedExchangeAdd((long RT_FAR *)pu32, u32);
4996 return u32;
4997
4998# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
4999# if RT_INLINE_ASM_GNU_STYLE
5000 __asm__ __volatile__("lock; xaddl %0, %1\n\t"
5001 : "=r" (u32)
5002 , "=m" (*pu32)
5003 : "0" (u32)
5004 , "m" (*pu32)
5005 : "memory"
5006 , "cc");
5007 return u32;
5008# else
5009 __asm
5010 {
5011 mov eax, [u32]
5012# ifdef RT_ARCH_AMD64
5013 mov rdx, [pu32]
5014 lock xadd [rdx], eax
5015# else
5016 mov edx, [pu32]
5017 lock xadd [edx], eax
5018# endif
5019 mov [u32], eax
5020 }
5021 return u32;
5022# endif
5023
5024# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
5025 /* M1 benchmark: ldaddal=6907 vs dmb+ldadd=2114 vs non-lse=6249 (ps/call) */
5026# if defined(RTASM_ARM64_USE_FEAT_LSE)
5027 uint32_t u32OldRet;
5028 __asm__ __volatile__("Lstart_ASMAtomicAddU32_%=:\n\t"
5029# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
5030 "ldaddal %w[uAddend], %w[uOldActual], %[pMem]\n\t"
5031# else
5032 RTASM_ARM_DMB_SY
5033 "ldadd %w[uAddend], %w[uOldActual], %[pMem]\n\t"
5034# endif
5035 : [pMem] "+Q" (*pu32)
5036 , [uOldActual] "=&r" (u32OldRet)
5037 : [uAddend] "r" (u32)
5038 : );
5039# else
5040 RTASM_ARM_LOAD_MODIFY_STORE_RET_OLD_32(ASMAtomicAddU32, pu32, DMB_SY,
5041 "add %w[uNew], %w[uOld], %w[uVal]\n\t",
5042 "add %[uNew], %[uOld], %[uVal]\n\t",
5043 [uVal] "r" (u32));
5044# endif
5045 return u32OldRet;
5046
5047# else
5048# error "Port me"
5049# endif
5050}
5051#endif
5052
5053
5054/**
5055 * Atomically exchanges and adds to a signed 32-bit value, ordered.
5056 *
5057 * @returns The old value.
5058 * @param pi32 Pointer to the value.
5059 * @param i32 Number to add.
5060 *
5061 * @remarks x86: Requires a 486 or later.
5062 */
5063DECLINLINE(int32_t) ASMAtomicAddS32(int32_t volatile RT_FAR *pi32, int32_t i32) RT_NOTHROW_DEF
5064{
5065 return (int32_t)ASMAtomicAddU32((uint32_t volatile RT_FAR *)pi32, (uint32_t)i32);
5066}
5067
5068
5069/**
5070 * Atomically exchanges and adds to a 64-bit value, ordered.
5071 *
5072 * @returns The old value.
5073 * @param pu64 Pointer to the value.
5074 * @param u64 Number to add.
5075 *
5076 * @remarks x86: Requires a Pentium or later.
5077 */
5078#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
5079DECLASM(uint64_t) ASMAtomicAddU64(uint64_t volatile RT_FAR *pu64, uint64_t u64) RT_NOTHROW_PROTO;
5080#else
5081DECLINLINE(uint64_t) ASMAtomicAddU64(uint64_t volatile RT_FAR *pu64, uint64_t u64) RT_NOTHROW_DEF
5082{
5083# if RT_INLINE_ASM_USES_INTRIN && (defined(RT_ARCH_AMD64) || defined(RT_ARCH_ARM64))
5084 u64 = _InterlockedExchangeAdd64((__int64 RT_FAR *)pu64, u64);
5085 return u64;
5086
5087# elif RT_INLINE_ASM_GNU_STYLE && defined(RT_ARCH_AMD64)
5088 __asm__ __volatile__("lock; xaddq %0, %1\n\t"
5089 : "=r" (u64)
5090 , "=m" (*pu64)
5091 : "0" (u64)
5092 , "m" (*pu64)
5093 : "memory"
5094 , "cc");
5095 return u64;
5096
5097# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
5098# if defined(RTASM_ARM64_USE_FEAT_LSE)
5099 uint64_t u64OldRet;
5100 __asm__ __volatile__("Lstart_ASMAtomicAddU64_%=:\n\t"
5101# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
5102 "ldaddal %[uAddend], %[uOldActual], %[pMem]\n\t"
5103# else
5104 RTASM_ARM_DMB_SY
5105 "ldadd %[uAddend], %[uOldActual], %[pMem]\n\t"
5106# endif
5107 : [pMem] "+Q" (*pu64)
5108 , [uOldActual] "=&r" (u64OldRet)
5109 : [uAddend] "r" (u64)
5110 : );
5111# else
5112 RTASM_ARM_LOAD_MODIFY_STORE_RET_OLD_64(ASMAtomicAddU64, pu64, DMB_SY,
5113 "add %[uNew], %[uOld], %[uVal]\n\t"
5114 ,
5115 "add %[uNew], %[uOld], %[uVal]\n\t"
5116 "adc %H[uNew], %H[uOld], %H[uVal]\n\t",
5117 [uVal] "r" (u64));
5118# endif
5119 return u64OldRet;
5120
5121# else
5122 uint64_t u64Old;
5123 for (;;)
5124 {
5125 uint64_t u64New;
5126 u64Old = ASMAtomicUoReadU64(pu64);
5127 u64New = u64Old + u64;
5128 if (ASMAtomicCmpXchgU64(pu64, u64New, u64Old))
5129 break;
5130 ASMNopPause();
5131 }
5132 return u64Old;
5133# endif
5134}
5135#endif
5136
5137
5138/**
5139 * Atomically exchanges and adds to a signed 64-bit value, ordered.
5140 *
5141 * @returns The old value.
5142 * @param pi64 Pointer to the value.
5143 * @param i64 Number to add.
5144 *
5145 * @remarks x86: Requires a Pentium or later.
5146 */
5147DECLINLINE(int64_t) ASMAtomicAddS64(int64_t volatile RT_FAR *pi64, int64_t i64) RT_NOTHROW_DEF
5148{
5149 return (int64_t)ASMAtomicAddU64((uint64_t volatile RT_FAR *)pi64, (uint64_t)i64);
5150}
5151
5152
5153/**
5154 * Atomically exchanges and adds to a size_t value, ordered.
5155 *
5156 * @returns The old value.
5157 * @param pcb Pointer to the size_t value.
5158 * @param cb Number to add.
5159 */
5160DECLINLINE(size_t) ASMAtomicAddZ(size_t volatile RT_FAR *pcb, size_t cb) RT_NOTHROW_DEF
5161{
5162#if ARCH_BITS == 64
5163 AssertCompileSize(size_t, 8);
5164 return ASMAtomicAddU64((uint64_t volatile RT_FAR *)pcb, cb);
5165#elif ARCH_BITS == 32
5166 AssertCompileSize(size_t, 4);
5167 return ASMAtomicAddU32((uint32_t volatile RT_FAR *)pcb, cb);
5168#elif ARCH_BITS == 16
5169 AssertCompileSize(size_t, 2);
5170 return ASMAtomicAddU16((uint16_t volatile RT_FAR *)pcb, cb);
5171#else
5172# error "Unsupported ARCH_BITS value"
5173#endif
5174}
5175
5176
5177/**
5178 * Atomically exchanges and adds a value which size might differ between
5179 * platforms or compilers, ordered.
5180 *
5181 * @param pu Pointer to the variable to update.
5182 * @param uNew The value to add to *pu.
5183 * @param puOld Where to store the old value.
5184 */
5185#define ASMAtomicAddSize(pu, uNew, puOld) \
5186 do { \
5187 switch (sizeof(*(pu))) { \
5188 case 4: *(uint32_t *)(puOld) = ASMAtomicAddU32((volatile uint32_t RT_FAR *)(void RT_FAR *)(pu), (uint32_t)(uNew)); break; \
5189 case 8: *(uint64_t *)(puOld) = ASMAtomicAddU64((volatile uint64_t RT_FAR *)(void RT_FAR *)(pu), (uint64_t)(uNew)); break; \
5190 default: AssertMsgFailed(("ASMAtomicAddSize: size %d is not supported\n", sizeof(*(pu)))); \
5191 } \
5192 } while (0)
5193
5194
5195
5196/**
5197 * Atomically exchanges and subtracts to an unsigned 16-bit value, ordered.
5198 *
5199 * @returns The old value.
5200 * @param pu16 Pointer to the value.
5201 * @param u16 Number to subtract.
5202 *
5203 * @remarks x86: Requires a 486 or later.
5204 */
5205DECLINLINE(uint16_t) ASMAtomicSubU16(uint16_t volatile RT_FAR *pu16, uint32_t u16) RT_NOTHROW_DEF
5206{
5207 return ASMAtomicAddU16(pu16, (uint16_t)-(int16_t)u16);
5208}
5209
5210
5211/**
5212 * Atomically exchanges and subtracts to a signed 16-bit value, ordered.
5213 *
5214 * @returns The old value.
5215 * @param pi16 Pointer to the value.
5216 * @param i16 Number to subtract.
5217 *
5218 * @remarks x86: Requires a 486 or later.
5219 */
5220DECLINLINE(int16_t) ASMAtomicSubS16(int16_t volatile RT_FAR *pi16, int16_t i16) RT_NOTHROW_DEF
5221{
5222 return (int16_t)ASMAtomicAddU16((uint16_t volatile RT_FAR *)pi16, (uint16_t)-i16);
5223}
5224
5225
5226/**
5227 * Atomically exchanges and subtracts to an unsigned 32-bit value, ordered.
5228 *
5229 * @returns The old value.
5230 * @param pu32 Pointer to the value.
5231 * @param u32 Number to subtract.
5232 *
5233 * @remarks x86: Requires a 486 or later.
5234 */
5235DECLINLINE(uint32_t) ASMAtomicSubU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_DEF
5236{
5237 return ASMAtomicAddU32(pu32, (uint32_t)-(int32_t)u32);
5238}
5239
5240
5241/**
5242 * Atomically exchanges and subtracts to a signed 32-bit value, ordered.
5243 *
5244 * @returns The old value.
5245 * @param pi32 Pointer to the value.
5246 * @param i32 Number to subtract.
5247 *
5248 * @remarks x86: Requires a 486 or later.
5249 */
5250DECLINLINE(int32_t) ASMAtomicSubS32(int32_t volatile RT_FAR *pi32, int32_t i32) RT_NOTHROW_DEF
5251{
5252 return (int32_t)ASMAtomicAddU32((uint32_t volatile RT_FAR *)pi32, (uint32_t)-i32);
5253}
5254
5255
5256/**
5257 * Atomically exchanges and subtracts to an unsigned 64-bit value, ordered.
5258 *
5259 * @returns The old value.
5260 * @param pu64 Pointer to the value.
5261 * @param u64 Number to subtract.
5262 *
5263 * @remarks x86: Requires a Pentium or later.
5264 */
5265DECLINLINE(uint64_t) ASMAtomicSubU64(uint64_t volatile RT_FAR *pu64, uint64_t u64) RT_NOTHROW_DEF
5266{
5267 return ASMAtomicAddU64(pu64, (uint64_t)-(int64_t)u64);
5268}
5269
5270
5271/**
5272 * Atomically exchanges and subtracts to a signed 64-bit value, ordered.
5273 *
5274 * @returns The old value.
5275 * @param pi64 Pointer to the value.
5276 * @param i64 Number to subtract.
5277 *
5278 * @remarks x86: Requires a Pentium or later.
5279 */
5280DECLINLINE(int64_t) ASMAtomicSubS64(int64_t volatile RT_FAR *pi64, int64_t i64) RT_NOTHROW_DEF
5281{
5282 return (int64_t)ASMAtomicAddU64((uint64_t volatile RT_FAR *)pi64, (uint64_t)-i64);
5283}
5284
5285
5286/**
5287 * Atomically exchanges and subtracts to a size_t value, ordered.
5288 *
5289 * @returns The old value.
5290 * @param pcb Pointer to the size_t value.
5291 * @param cb Number to subtract.
5292 *
5293 * @remarks x86: Requires a 486 or later.
5294 */
5295DECLINLINE(size_t) ASMAtomicSubZ(size_t volatile RT_FAR *pcb, size_t cb) RT_NOTHROW_DEF
5296{
5297#if ARCH_BITS == 64
5298 return ASMAtomicSubU64((uint64_t volatile RT_FAR *)pcb, cb);
5299#elif ARCH_BITS == 32
5300 return ASMAtomicSubU32((uint32_t volatile RT_FAR *)pcb, cb);
5301#elif ARCH_BITS == 16
5302 AssertCompileSize(size_t, 2);
5303 return ASMAtomicSubU16((uint16_t volatile RT_FAR *)pcb, cb);
5304#else
5305# error "Unsupported ARCH_BITS value"
5306#endif
5307}
5308
5309
5310/**
5311 * Atomically exchanges and subtracts a value which size might differ between
5312 * platforms or compilers, ordered.
5313 *
5314 * @param pu Pointer to the variable to update.
5315 * @param uNew The value to subtract to *pu.
5316 * @param puOld Where to store the old value.
5317 *
5318 * @remarks x86: Requires a 486 or later.
5319 */
5320#define ASMAtomicSubSize(pu, uNew, puOld) \
5321 do { \
5322 switch (sizeof(*(pu))) { \
5323 case 4: *(uint32_t RT_FAR *)(puOld) = ASMAtomicSubU32((volatile uint32_t RT_FAR *)(void RT_FAR *)(pu), (uint32_t)(uNew)); break; \
5324 case 8: *(uint64_t RT_FAR *)(puOld) = ASMAtomicSubU64((volatile uint64_t RT_FAR *)(void RT_FAR *)(pu), (uint64_t)(uNew)); break; \
5325 default: AssertMsgFailed(("ASMAtomicSubSize: size %d is not supported\n", sizeof(*(pu)))); \
5326 } \
5327 } while (0)
5328
5329
5330
5331/**
5332 * Atomically increment a 16-bit value, ordered.
5333 *
5334 * @returns The new value.
5335 * @param pu16 Pointer to the value to increment.
5336 * @remarks Not implemented. Just to make 16-bit code happy.
5337 *
5338 * @remarks x86: Requires a 486 or later.
5339 */
5340RT_ASM_DECL_PRAGMA_WATCOM(uint16_t) ASMAtomicIncU16(uint16_t volatile RT_FAR *pu16) RT_NOTHROW_PROTO;
5341
5342
5343/**
5344 * Atomically increment a 32-bit value, ordered.
5345 *
5346 * @returns The new value.
5347 * @param pu32 Pointer to the value to increment.
5348 *
5349 * @remarks x86: Requires a 486 or later.
5350 */
5351#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
5352RT_ASM_DECL_PRAGMA_WATCOM(uint32_t) ASMAtomicIncU32(uint32_t volatile RT_FAR *pu32) RT_NOTHROW_PROTO;
5353#else
5354DECLINLINE(uint32_t) ASMAtomicIncU32(uint32_t volatile RT_FAR *pu32) RT_NOTHROW_DEF
5355{
5356# if RT_INLINE_ASM_USES_INTRIN
5357 return (uint32_t)_InterlockedIncrement((long RT_FAR *)pu32);
5358
5359# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
5360# if RT_INLINE_ASM_GNU_STYLE
5361 uint32_t u32;
5362 __asm__ __volatile__("lock; xaddl %0, %1\n\t"
5363 : "=r" (u32)
5364 , "=m" (*pu32)
5365 : "0" (1)
5366 , "m" (*pu32)
5367 : "memory"
5368 , "cc");
5369 return u32+1;
5370# else
5371 __asm
5372 {
5373 mov eax, 1
5374# ifdef RT_ARCH_AMD64
5375 mov rdx, [pu32]
5376 lock xadd [rdx], eax
5377# else
5378 mov edx, [pu32]
5379 lock xadd [edx], eax
5380# endif
5381 mov u32, eax
5382 }
5383 return u32+1;
5384# endif
5385
5386# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
5387 /* M1 benchmark: ldaddal=6887 vs dmb+ldadd=2117 vs non-lse=6247 (ps/call) */
5388# if defined(RTASM_ARM64_USE_FEAT_LSE)
5389 uint32_t u32NewRet;
5390 __asm__ __volatile__("Lstart_ASMAtomicIncU32_%=:\n\t"
5391# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
5392 "ldaddal %w[uAddend], %w[uNewRet], %[pMem]\n\t"
5393# else
5394 RTASM_ARM_DMB_SY
5395 "ldadd %w[uAddend], %w[uNewRet], %[pMem]\n\t"
5396# endif
5397 "add %w[uNewRet], %w[uNewRet], #1\n\t"
5398 : [pMem] "+Q" (*pu32)
5399 , [uNewRet] "=&r" (u32NewRet)
5400 : [uAddend] "r" ((uint32_t)1)
5401 : );
5402# else
5403 RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_32(ASMAtomicIncU32, pu32, DMB_SY,
5404 "add %w[uNew], %w[uNew], #1\n\t",
5405 "add %[uNew], %[uNew], #1\n\t" /* arm6 / thumb2+ */,
5406 "X" (0) /* dummy */);
5407# endif
5408 return u32NewRet;
5409
5410# else
5411 return ASMAtomicAddU32(pu32, 1) + 1;
5412# endif
5413}
5414#endif
5415
5416
5417/**
5418 * Atomically increment a signed 32-bit value, ordered.
5419 *
5420 * @returns The new value.
5421 * @param pi32 Pointer to the value to increment.
5422 *
5423 * @remarks x86: Requires a 486 or later.
5424 */
5425DECLINLINE(int32_t) ASMAtomicIncS32(int32_t volatile RT_FAR *pi32) RT_NOTHROW_DEF
5426{
5427 return (int32_t)ASMAtomicIncU32((uint32_t volatile RT_FAR *)pi32);
5428}
5429
5430
5431/**
5432 * Atomically increment a 64-bit value, ordered.
5433 *
5434 * @returns The new value.
5435 * @param pu64 Pointer to the value to increment.
5436 *
5437 * @remarks x86: Requires a Pentium or later.
5438 */
5439#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
5440DECLASM(uint64_t) ASMAtomicIncU64(uint64_t volatile RT_FAR *pu64) RT_NOTHROW_PROTO;
5441#else
5442DECLINLINE(uint64_t) ASMAtomicIncU64(uint64_t volatile RT_FAR *pu64) RT_NOTHROW_DEF
5443{
5444# if RT_INLINE_ASM_USES_INTRIN && (defined(RT_ARCH_AMD64) || defined(RT_ARCH_ARM64))
5445 return (uint64_t)_InterlockedIncrement64((__int64 RT_FAR *)pu64);
5446
5447# elif RT_INLINE_ASM_GNU_STYLE && defined(RT_ARCH_AMD64)
5448 uint64_t u64;
5449 __asm__ __volatile__("lock; xaddq %0, %1\n\t"
5450 : "=r" (u64)
5451 , "=m" (*pu64)
5452 : "0" (1)
5453 , "m" (*pu64)
5454 : "memory"
5455 , "cc");
5456 return u64 + 1;
5457
5458# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
5459# if defined(RTASM_ARM64_USE_FEAT_LSE)
5460 uint64_t u64NewRet;
5461 __asm__ __volatile__("Lstart_ASMAtomicIncU64_%=:\n\t"
5462# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
5463 "ldaddal %[uAddend], %[uNewRet], %[pMem]\n\t"
5464# else
5465 RTASM_ARM_DMB_SY
5466 "ldadd %[uAddend], %[uNewRet], %[pMem]\n\t"
5467# endif
5468 "add %[uNewRet], %[uNewRet], #1\n\t"
5469 : [pMem] "+Q" (*pu64)
5470 , [uNewRet] "=&r" (u64NewRet)
5471 : [uAddend] "r" ((uint64_t)1)
5472 : );
5473# else
5474 RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_64(ASMAtomicIncU64, pu64, DMB_SY,
5475 "add %[uNew], %[uNew], #1\n\t"
5476 ,
5477 "add %[uNew], %[uNew], #1\n\t" /* arm6 / thumb2+ */
5478 "adc %H[uNew], %H[uNew], %[uZeroVal]\n\t",
5479 RTASM_ARM_PICK_6432("X" (0) /* dummy */, [uZeroVal] "r" (0)) );
5480# endif
5481 return u64NewRet;
5482
5483# else
5484 return ASMAtomicAddU64(pu64, 1) + 1;
5485# endif
5486}
5487#endif
5488
5489
5490/**
5491 * Atomically increment a signed 64-bit value, ordered.
5492 *
5493 * @returns The new value.
5494 * @param pi64 Pointer to the value to increment.
5495 *
5496 * @remarks x86: Requires a Pentium or later.
5497 */
5498DECLINLINE(int64_t) ASMAtomicIncS64(int64_t volatile RT_FAR *pi64) RT_NOTHROW_DEF
5499{
5500 return (int64_t)ASMAtomicIncU64((uint64_t volatile RT_FAR *)pi64);
5501}
5502
5503
5504/**
5505 * Atomically increment a size_t value, ordered.
5506 *
5507 * @returns The new value.
5508 * @param pcb Pointer to the value to increment.
5509 *
5510 * @remarks x86: Requires a 486 or later.
5511 */
5512DECLINLINE(size_t) ASMAtomicIncZ(size_t volatile RT_FAR *pcb) RT_NOTHROW_DEF
5513{
5514#if ARCH_BITS == 64
5515 return ASMAtomicIncU64((uint64_t volatile RT_FAR *)pcb);
5516#elif ARCH_BITS == 32
5517 return ASMAtomicIncU32((uint32_t volatile RT_FAR *)pcb);
5518#elif ARCH_BITS == 16
5519 return ASMAtomicIncU16((uint16_t volatile RT_FAR *)pcb);
5520#else
5521# error "Unsupported ARCH_BITS value"
5522#endif
5523}
5524
5525
5526
5527/**
5528 * Atomically decrement an unsigned 32-bit value, ordered.
5529 *
5530 * @returns The new value.
5531 * @param pu16 Pointer to the value to decrement.
5532 * @remarks Not implemented. Just to make 16-bit code happy.
5533 *
5534 * @remarks x86: Requires a 486 or later.
5535 */
5536RT_ASM_DECL_PRAGMA_WATCOM(uint32_t) ASMAtomicDecU16(uint16_t volatile RT_FAR *pu16) RT_NOTHROW_PROTO;
5537
5538
5539/**
5540 * Atomically decrement an unsigned 32-bit value, ordered.
5541 *
5542 * @returns The new value.
5543 * @param pu32 Pointer to the value to decrement.
5544 *
5545 * @remarks x86: Requires a 486 or later.
5546 */
5547#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
5548RT_ASM_DECL_PRAGMA_WATCOM(uint32_t) ASMAtomicDecU32(uint32_t volatile RT_FAR *pu32) RT_NOTHROW_PROTO;
5549#else
5550DECLINLINE(uint32_t) ASMAtomicDecU32(uint32_t volatile RT_FAR *pu32) RT_NOTHROW_DEF
5551{
5552# if RT_INLINE_ASM_USES_INTRIN
5553 return (uint32_t)_InterlockedDecrement((long RT_FAR *)pu32);
5554
5555# elif RT_INLINE_ASM_GNU_STYLE && defined(RT_ARCH_AMD64)
5556# if RT_INLINE_ASM_GNU_STYLE
5557 uint32_t u32;
5558 __asm__ __volatile__("lock; xaddl %0, %1\n\t"
5559 : "=r" (u32)
5560 , "=m" (*pu32)
5561 : "0" (-1)
5562 , "m" (*pu32)
5563 : "memory"
5564 , "cc");
5565 return u32-1;
5566# else
5567 uint32_t u32;
5568 __asm
5569 {
5570 mov eax, -1
5571# ifdef RT_ARCH_AMD64
5572 mov rdx, [pu32]
5573 lock xadd [rdx], eax
5574# else
5575 mov edx, [pu32]
5576 lock xadd [edx], eax
5577# endif
5578 mov u32, eax
5579 }
5580 return u32-1;
5581# endif
5582
5583# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
5584 /* M1 benchmark: ldaddal=6887 vs dmb+ldadd=2120 vs non-lse=6260 (ps/call) */
5585# if defined(RTASM_ARM64_USE_FEAT_LSE)
5586 uint32_t u32NewRet;
5587 __asm__ __volatile__("Lstart_ASMAtomicDecU32_%=:\n\t"
5588# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
5589 "ldaddal %w[uAddend], %w[uNewRet], %[pMem]\n\t"
5590# else
5591 RTASM_ARM_DMB_SY
5592 "ldadd %w[uAddend], %w[uNewRet], %[pMem]\n\t"
5593# endif
5594 "sub %w[uNewRet], %w[uNewRet], #1\n\t"
5595 : [pMem] "+Q" (*pu32)
5596 , [uNewRet] "=&r" (u32NewRet)
5597 : [uAddend] "r" (~(uint32_t)0)
5598 : );
5599# else
5600 RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_32(ASMAtomicDecU32, pu32, DMB_SY,
5601 "sub %w[uNew], %w[uNew], #1\n\t",
5602 "sub %[uNew], %[uNew], #1\n\t" /* arm6 / thumb2+ */,
5603 "X" (0) /* dummy */);
5604# endif
5605 return u32NewRet;
5606
5607# else
5608 return ASMAtomicSubU32(pu32, 1) - (uint32_t)1;
5609# endif
5610}
5611#endif
5612
5613
5614/**
5615 * Atomically decrement a signed 32-bit value, ordered.
5616 *
5617 * @returns The new value.
5618 * @param pi32 Pointer to the value to decrement.
5619 *
5620 * @remarks x86: Requires a 486 or later.
5621 */
5622DECLINLINE(int32_t) ASMAtomicDecS32(int32_t volatile RT_FAR *pi32) RT_NOTHROW_DEF
5623{
5624 return (int32_t)ASMAtomicDecU32((uint32_t volatile RT_FAR *)pi32);
5625}
5626
5627
5628/**
5629 * Atomically decrement an unsigned 64-bit value, ordered.
5630 *
5631 * @returns The new value.
5632 * @param pu64 Pointer to the value to decrement.
5633 *
5634 * @remarks x86: Requires a Pentium or later.
5635 */
5636#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
5637RT_ASM_DECL_PRAGMA_WATCOM(uint64_t) ASMAtomicDecU64(uint64_t volatile RT_FAR *pu64) RT_NOTHROW_PROTO;
5638#else
5639DECLINLINE(uint64_t) ASMAtomicDecU64(uint64_t volatile RT_FAR *pu64) RT_NOTHROW_DEF
5640{
5641# if RT_INLINE_ASM_USES_INTRIN && (defined(RT_ARCH_AMD64) || defined(RT_ARCH_ARM64))
5642 return (uint64_t)_InterlockedDecrement64((__int64 volatile RT_FAR *)pu64);
5643
5644# elif RT_INLINE_ASM_GNU_STYLE && defined(RT_ARCH_AMD64)
5645 uint64_t u64;
5646 __asm__ __volatile__("lock; xaddq %q0, %1\n\t"
5647 : "=r" (u64)
5648 , "=m" (*pu64)
5649 : "0" (~(uint64_t)0)
5650 , "m" (*pu64)
5651 : "memory"
5652 , "cc");
5653 return u64-1;
5654
5655# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
5656# if defined(RTASM_ARM64_USE_FEAT_LSE)
5657 uint64_t u64NewRet;
5658 __asm__ __volatile__("Lstart_ASMAtomicDecU64_%=:\n\t"
5659# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
5660 "ldaddal %[uAddend], %[uNewRet], %[pMem]\n\t"
5661# else
5662 RTASM_ARM_DMB_SY
5663 "ldadd %[uAddend], %[uNewRet], %[pMem]\n\t"
5664# endif
5665 "sub %[uNewRet], %[uNewRet], #1\n\t"
5666 : [pMem] "+Q" (*pu64)
5667 , [uNewRet] "=&r" (u64NewRet)
5668 : [uAddend] "r" (~(uint64_t)0)
5669 : );
5670# else
5671 RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_64(ASMAtomicDecU64, pu64, DMB_SY,
5672 "sub %[uNew], %[uNew], #1\n\t"
5673 ,
5674 "sub %[uNew], %[uNew], #1\n\t" /* arm6 / thumb2+ */
5675 "sbc %H[uNew], %H[uNew], %[uZeroVal]\n\t",
5676 RTASM_ARM_PICK_6432("X" (0) /* dummy */, [uZeroVal] "r" (0)) );
5677# endif
5678 return u64NewRet;
5679
5680# else
5681 return ASMAtomicAddU64(pu64, UINT64_MAX) - 1;
5682# endif
5683}
5684#endif
5685
5686
5687/**
5688 * Atomically decrement a signed 64-bit value, ordered.
5689 *
5690 * @returns The new value.
5691 * @param pi64 Pointer to the value to decrement.
5692 *
5693 * @remarks x86: Requires a Pentium or later.
5694 */
5695DECLINLINE(int64_t) ASMAtomicDecS64(int64_t volatile RT_FAR *pi64) RT_NOTHROW_DEF
5696{
5697 return (int64_t)ASMAtomicDecU64((uint64_t volatile RT_FAR *)pi64);
5698}
5699
5700
5701/**
5702 * Atomically decrement a size_t value, ordered.
5703 *
5704 * @returns The new value.
5705 * @param pcb Pointer to the value to decrement.
5706 *
5707 * @remarks x86: Requires a 486 or later.
5708 */
5709DECLINLINE(size_t) ASMAtomicDecZ(size_t volatile RT_FAR *pcb) RT_NOTHROW_DEF
5710{
5711#if ARCH_BITS == 64
5712 return ASMAtomicDecU64((uint64_t volatile RT_FAR *)pcb);
5713#elif ARCH_BITS == 32
5714 return ASMAtomicDecU32((uint32_t volatile RT_FAR *)pcb);
5715#elif ARCH_BITS == 16
5716 return ASMAtomicDecU16((uint16_t volatile RT_FAR *)pcb);
5717#else
5718# error "Unsupported ARCH_BITS value"
5719#endif
5720}
5721
5722
5723/**
5724 * Atomically Or an unsigned 32-bit value, ordered.
5725 *
5726 * @param pu32 Pointer to the pointer variable to OR u32 with.
5727 * @param u32 The value to OR *pu32 with.
5728 *
5729 * @remarks x86: Requires a 386 or later.
5730 */
5731#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
5732RT_ASM_DECL_PRAGMA_WATCOM(void) ASMAtomicOrU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_PROTO;
5733#else
5734DECLINLINE(void) ASMAtomicOrU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_DEF
5735{
5736# if RT_INLINE_ASM_USES_INTRIN
5737 _InterlockedOr((long volatile RT_FAR *)pu32, (long)u32);
5738
5739# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
5740# if RT_INLINE_ASM_GNU_STYLE
5741 __asm__ __volatile__("lock; orl %1, %0\n\t"
5742 : "=m" (*pu32)
5743 : "ir" (u32)
5744 , "m" (*pu32)
5745 : "cc");
5746# else
5747 __asm
5748 {
5749 mov eax, [u32]
5750# ifdef RT_ARCH_AMD64
5751 mov rdx, [pu32]
5752 lock or [rdx], eax
5753# else
5754 mov edx, [pu32]
5755 lock or [edx], eax
5756# endif
5757 }
5758# endif
5759
5760# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
5761# if defined(RTASM_ARM64_USE_FEAT_LSE)
5762# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
5763 uint32_t u32Spill;
5764 __asm__ __volatile__("Lstart_ASMAtomicOrU32_%=:\n\t"
5765 "ldsetal %w[fBitsToSet], %w[uSpill], %[pMem]\n\t"
5766 : [pMem] "+Q" (*pu32)
5767 , [uSpill] "=&r" (u32Spill)
5768 : [fBitsToSet] "r" (u32)
5769 : );
5770# else
5771 __asm__ __volatile__("Lstart_ASMAtomicOrU32_%=:\n\t"
5772 RTASM_ARM_DMB_SY
5773 "stset %w[fBitsToSet], %[pMem]\n\t"
5774 : [pMem] "+Q" (*pu32)
5775 : [fBitsToSet] "r" (u32)
5776 : );
5777# endif
5778# else
5779 /* For more on Orr see https://en.wikipedia.org/wiki/Orr_(Catch-22) ;-) */
5780 RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_32(ASMAtomicOr32, pu32, DMB_SY,
5781 "orr %w[uNew], %w[uNew], %w[uVal]\n\t",
5782 "orr %[uNew], %[uNew], %[uVal]\n\t",
5783 [uVal] "r" (u32));
5784
5785# endif
5786# else
5787# error "Port me"
5788# endif
5789}
5790#endif
5791
5792
5793/**
5794 * Atomically OR an unsigned 32-bit value, ordered, extended version (for bitmap
5795 * fallback).
5796 *
5797 * @returns Old value.
5798 * @param pu32 Pointer to the variable to OR @a u32 with.
5799 * @param u32 The value to OR @a *pu32 with.
5800 */
5801DECLINLINE(uint32_t) ASMAtomicOrExU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_DEF
5802{
5803#if RT_INLINE_ASM_USES_INTRIN
5804 return (uint32_t)_InterlockedOr((long volatile RT_FAR *)pu32, (long)u32);
5805
5806#elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
5807# if defined(RTASM_ARM64_USE_FEAT_LSE)
5808 uint32_t u32OldRet;
5809 __asm__ __volatile__("Lstart_ASMAtomicOrExU32_%=:\n\t"
5810# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
5811 "ldsetal %w[fBitsToSet], %w[uOldRet], %[pMem]\n\t"
5812# else
5813 RTASM_ARM_DMB_SY
5814 "ldset %w[fBitsToSet], %w[uOldRet], %[pMem]\n\t"
5815# endif
5816 : [pMem] "+Q" (*pu32)
5817 , [uOldRet] "=&r" (u32OldRet)
5818 : [fBitsToSet] "r" (u32)
5819 : );
5820# else
5821 RTASM_ARM_LOAD_MODIFY_STORE_RET_OLD_32(ASMAtomicOrEx32, pu32, DMB_SY,
5822 "orr %w[uNew], %w[uOld], %w[uVal]\n\t",
5823 "orr %[uNew], %[uOld], %[uVal]\n\t",
5824 [uVal] "r" (u32));
5825# endif
5826 return u32OldRet;
5827
5828#else
5829 uint32_t u32RetOld = ASMAtomicUoReadU32(pu32);
5830 uint32_t u32New;
5831 do
5832 u32New = u32RetOld | u32;
5833 while (!ASMAtomicCmpXchgExU32(pu32, u32New, u32RetOld, &u32RetOld));
5834 return u32RetOld;
5835#endif
5836}
5837
5838
5839/**
5840 * Atomically Or a signed 32-bit value, ordered.
5841 *
5842 * @param pi32 Pointer to the pointer variable to OR u32 with.
5843 * @param i32 The value to OR *pu32 with.
5844 *
5845 * @remarks x86: Requires a 386 or later.
5846 */
5847DECLINLINE(void) ASMAtomicOrS32(int32_t volatile RT_FAR *pi32, int32_t i32) RT_NOTHROW_DEF
5848{
5849 ASMAtomicOrU32((uint32_t volatile RT_FAR *)pi32, (uint32_t)i32);
5850}
5851
5852
5853/**
5854 * Atomically Or an unsigned 64-bit value, ordered.
5855 *
5856 * @param pu64 Pointer to the pointer variable to OR u64 with.
5857 * @param u64 The value to OR *pu64 with.
5858 *
5859 * @remarks x86: Requires a Pentium or later.
5860 */
5861#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
5862DECLASM(void) ASMAtomicOrU64(uint64_t volatile RT_FAR *pu64, uint64_t u64) RT_NOTHROW_PROTO;
5863#else
5864DECLINLINE(void) ASMAtomicOrU64(uint64_t volatile RT_FAR *pu64, uint64_t u64) RT_NOTHROW_DEF
5865{
5866# if RT_INLINE_ASM_USES_INTRIN && (defined(RT_ARCH_AMD64) || defined(RT_ARCH_ARM64))
5867 _InterlockedOr64((__int64 volatile RT_FAR *)pu64, (__int64)u64);
5868
5869# elif RT_INLINE_ASM_GNU_STYLE && defined(RT_ARCH_AMD64)
5870 __asm__ __volatile__("lock; orq %1, %q0\n\t"
5871 : "=m" (*pu64)
5872 : "r" (u64)
5873 , "m" (*pu64)
5874 : "cc");
5875
5876# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
5877# if defined(RTASM_ARM64_USE_FEAT_LSE)
5878# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
5879 uint64_t u64Spill;
5880 __asm__ __volatile__("Lstart_ASMAtomicOrU64_%=:\n\t"
5881 "ldsetal %[fBitsToSet], %[uSpill], %[pMem]\n\t"
5882 : [pMem] "+Q" (*pu64)
5883 , [uSpill] "=&r" (u64Spill)
5884 : [fBitsToSet] "r" (u64)
5885 : );
5886# else
5887 __asm__ __volatile__("Lstart_ASMAtomicOrU64_%=:\n\t"
5888 RTASM_ARM_DMB_SY
5889 "stset %[fBitsToSet], %[pMem]\n\t"
5890 : [pMem] "+Q" (*pu64)
5891 : [fBitsToSet] "r" (u64)
5892 : );
5893# endif
5894# else
5895 RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_64(ASMAtomicOrU64, pu64, DMB_SY,
5896 "orr %[uNew], %[uNew], %[uVal]\n\t"
5897 ,
5898 "orr %[uNew], %[uNew], %[uVal]\n\t"
5899 "orr %H[uNew], %H[uNew], %H[uVal]\n\t",
5900 [uVal] "r" (u64));
5901# endif
5902
5903# else
5904 for (;;)
5905 {
5906 uint64_t u64Old = ASMAtomicUoReadU64(pu64);
5907 uint64_t u64New = u64Old | u64;
5908 if (ASMAtomicCmpXchgU64(pu64, u64New, u64Old))
5909 break;
5910 ASMNopPause();
5911 }
5912# endif
5913}
5914#endif
5915
5916
5917/**
5918 * Atomically Or a signed 64-bit value, ordered.
5919 *
5920 * @param pi64 Pointer to the pointer variable to OR u64 with.
5921 * @param i64 The value to OR *pu64 with.
5922 *
5923 * @remarks x86: Requires a Pentium or later.
5924 */
5925DECLINLINE(void) ASMAtomicOrS64(int64_t volatile RT_FAR *pi64, int64_t i64) RT_NOTHROW_DEF
5926{
5927 ASMAtomicOrU64((uint64_t volatile RT_FAR *)pi64, (uint64_t)i64);
5928}
5929
5930
5931/**
5932 * Atomically And an unsigned 32-bit value, ordered.
5933 *
5934 * @param pu32 Pointer to the pointer variable to AND u32 with.
5935 * @param u32 The value to AND *pu32 with.
5936 *
5937 * @remarks x86: Requires a 386 or later.
5938 */
5939#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
5940RT_ASM_DECL_PRAGMA_WATCOM(void) ASMAtomicAndU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_PROTO;
5941#else
5942DECLINLINE(void) ASMAtomicAndU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_DEF
5943{
5944# if RT_INLINE_ASM_USES_INTRIN
5945 _InterlockedAnd((long volatile RT_FAR *)pu32, u32);
5946
5947# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
5948# if RT_INLINE_ASM_GNU_STYLE
5949 __asm__ __volatile__("lock; andl %1, %0\n\t"
5950 : "=m" (*pu32)
5951 : "ir" (u32)
5952 , "m" (*pu32)
5953 : "cc");
5954# else
5955 __asm
5956 {
5957 mov eax, [u32]
5958# ifdef RT_ARCH_AMD64
5959 mov rdx, [pu32]
5960 lock and [rdx], eax
5961# else
5962 mov edx, [pu32]
5963 lock and [edx], eax
5964# endif
5965 }
5966# endif
5967
5968# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
5969# if defined(RTASM_ARM64_USE_FEAT_LSE)
5970# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
5971 uint32_t u32Spill;
5972 __asm__ __volatile__("Lstart_ASMAtomicAndU32_%=:\n\t"
5973 "ldclral %w[fBitsToClear], %w[uSpill], %[pMem]\n\t"
5974 : [pMem] "+Q" (*pu32)
5975 , [uSpill] "=&r" (u32Spill)
5976 : [fBitsToClear] "r" (~u32)
5977 : );
5978# else
5979 __asm__ __volatile__("Lstart_ASMAtomicAndU32_%=:\n\t"
5980 RTASM_ARM_DMB_SY
5981 "stclr %w[fBitsToClear], %[pMem]\n\t"
5982 : [pMem] "+Q" (*pu32)
5983 : [fBitsToClear] "r" (~u32)
5984 : );
5985# endif
5986# else
5987 RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_32(ASMAtomicAnd32, pu32, DMB_SY,
5988 "and %w[uNew], %w[uNew], %w[uVal]\n\t",
5989 "and %[uNew], %[uNew], %[uVal]\n\t",
5990 [uVal] "r" (u32));
5991
5992# endif
5993# else
5994# error "Port me"
5995# endif
5996}
5997#endif
5998
5999
6000/**
6001 * Atomically AND an unsigned 32-bit value, ordered, extended version.
6002 *
6003 * @returns Old value.
6004 * @param pu32 Pointer to the variable to AND @a u32 with.
6005 * @param u32 The value to AND @a *pu32 with.
6006 */
6007DECLINLINE(uint32_t) ASMAtomicAndExU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_DEF
6008{
6009#if RT_INLINE_ASM_USES_INTRIN
6010 return (uint32_t)_InterlockedAnd((long volatile RT_FAR *)pu32, (long)u32);
6011
6012#elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
6013# if defined(RTASM_ARM64_USE_FEAT_LSE)
6014 uint32_t u32OldRet;
6015 __asm__ __volatile__("Lstart_ASMAtomicAndExU32_%=:\n\t"
6016# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
6017 "ldclral %w[fBitsToClear], %w[uOldRet], %[pMem]\n\t"
6018# else
6019 RTASM_ARM_DMB_SY
6020 "ldclr %w[fBitsToClear], %w[uOldRet], %[pMem]\n\t"
6021# endif
6022 : [pMem] "+Q" (*pu32)
6023 , [uOldRet] "=&r" (u32OldRet)
6024 : [fBitsToClear] "r" (~u32)
6025 : );
6026# else
6027 RTASM_ARM_LOAD_MODIFY_STORE_RET_OLD_32(ASMAtomicAndEx32, pu32, DMB_SY,
6028 "and %w[uNew], %w[uOld], %w[uVal]\n\t",
6029 "and %[uNew], %[uOld], %[uVal]\n\t",
6030 [uVal] "r" (u32));
6031# endif
6032 return u32OldRet;
6033
6034#else
6035 uint32_t u32RetOld = ASMAtomicUoReadU32(pu32);
6036 uint32_t u32New;
6037 do
6038 u32New = u32RetOld & u32;
6039 while (!ASMAtomicCmpXchgExU32(pu32, u32New, u32RetOld, &u32RetOld));
6040 return u32RetOld;
6041#endif
6042}
6043
6044
6045/**
6046 * Atomically And a signed 32-bit value, ordered.
6047 *
6048 * @param pi32 Pointer to the pointer variable to AND i32 with.
6049 * @param i32 The value to AND *pi32 with.
6050 *
6051 * @remarks x86: Requires a 386 or later.
6052 */
6053DECLINLINE(void) ASMAtomicAndS32(int32_t volatile RT_FAR *pi32, int32_t i32) RT_NOTHROW_DEF
6054{
6055 ASMAtomicAndU32((uint32_t volatile RT_FAR *)pi32, (uint32_t)i32);
6056}
6057
6058
6059/**
6060 * Atomically And an unsigned 64-bit value, ordered.
6061 *
6062 * @param pu64 Pointer to the pointer variable to AND u64 with.
6063 * @param u64 The value to AND *pu64 with.
6064 *
6065 * @remarks x86: Requires a Pentium or later.
6066 */
6067#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
6068DECLASM(void) ASMAtomicAndU64(uint64_t volatile RT_FAR *pu64, uint64_t u64) RT_NOTHROW_PROTO;
6069#else
6070DECLINLINE(void) ASMAtomicAndU64(uint64_t volatile RT_FAR *pu64, uint64_t u64) RT_NOTHROW_DEF
6071{
6072# if RT_INLINE_ASM_USES_INTRIN && (defined(RT_ARCH_AMD64) || defined(RT_ARCH_ARM64))
6073 _InterlockedAnd64((__int64 volatile RT_FAR *)pu64, u64);
6074
6075# elif RT_INLINE_ASM_GNU_STYLE && defined(RT_ARCH_AMD64)
6076 __asm__ __volatile__("lock; andq %1, %0\n\t"
6077 : "=m" (*pu64)
6078 : "r" (u64)
6079 , "m" (*pu64)
6080 : "cc");
6081
6082# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
6083# if defined(RTASM_ARM64_USE_FEAT_LSE)
6084# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
6085 uint64_t u64Spill;
6086 __asm__ __volatile__("Lstart_ASMAtomicAndU64_%=:\n\t"
6087 "ldclral %[fBitsToClear], %[uSpill], %[pMem]\n\t"
6088 : [pMem] "+Q" (*pu64)
6089 , [uSpill] "=&r" (u64Spill)
6090 : [fBitsToClear] "r" (~u64)
6091 : );
6092# else
6093 __asm__ __volatile__("Lstart_ASMAtomicAndU64_%=:\n\t"
6094 RTASM_ARM_DMB_SY
6095 "stclr %[fBitsToClear], %[pMem]\n\t"
6096 : [pMem] "+Q" (*pu64)
6097 : [fBitsToClear] "r" (~u64)
6098 : );
6099# endif
6100# else
6101 RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_64(ASMAtomicAndU64, pu64, DMB_SY,
6102 "and %[uNew], %[uNew], %[uVal]\n\t"
6103 ,
6104 "and %[uNew], %[uNew], %[uVal]\n\t"
6105 "and %H[uNew], %H[uNew], %H[uVal]\n\t",
6106 [uVal] "r" (u64));
6107# endif
6108
6109# else
6110 for (;;)
6111 {
6112 uint64_t u64Old = ASMAtomicUoReadU64(pu64);
6113 uint64_t u64New = u64Old & u64;
6114 if (ASMAtomicCmpXchgU64(pu64, u64New, u64Old))
6115 break;
6116 ASMNopPause();
6117 }
6118# endif
6119}
6120#endif
6121
6122
6123/**
6124 * Atomically And a signed 64-bit value, ordered.
6125 *
6126 * @param pi64 Pointer to the pointer variable to AND i64 with.
6127 * @param i64 The value to AND *pi64 with.
6128 *
6129 * @remarks x86: Requires a Pentium or later.
6130 */
6131DECLINLINE(void) ASMAtomicAndS64(int64_t volatile RT_FAR *pi64, int64_t i64) RT_NOTHROW_DEF
6132{
6133 ASMAtomicAndU64((uint64_t volatile RT_FAR *)pi64, (uint64_t)i64);
6134}
6135
6136
6137/**
6138 * Atomically XOR an unsigned 32-bit value and a memory location, ordered.
6139 *
6140 * @param pu32 Pointer to the variable to XOR @a u32 with.
6141 * @param u32 The value to XOR @a *pu32 with.
6142 *
6143 * @remarks x86: Requires a 386 or later.
6144 */
6145#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
6146RT_ASM_DECL_PRAGMA_WATCOM(void) ASMAtomicXorU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_PROTO;
6147#else
6148DECLINLINE(void) ASMAtomicXorU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_DEF
6149{
6150# if RT_INLINE_ASM_USES_INTRIN
6151 _InterlockedXor((long volatile RT_FAR *)pu32, u32);
6152
6153# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
6154# if RT_INLINE_ASM_GNU_STYLE
6155 __asm__ __volatile__("lock; xorl %1, %0\n\t"
6156 : "=m" (*pu32)
6157 : "ir" (u32)
6158 , "m" (*pu32)
6159 : "cc");
6160# else
6161 __asm
6162 {
6163 mov eax, [u32]
6164# ifdef RT_ARCH_AMD64
6165 mov rdx, [pu32]
6166 lock xor [rdx], eax
6167# else
6168 mov edx, [pu32]
6169 lock xor [edx], eax
6170# endif
6171 }
6172# endif
6173
6174# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
6175# if defined(RTASM_ARM64_USE_FEAT_LSE)
6176# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
6177 uint32_t u32Spill;
6178 __asm__ __volatile__("Lstart_ASMAtomicXorU32_%=:\n\t"
6179 "ldeoral %w[fBitMask], %w[uSpill], %[pMem]\n\t"
6180 : [pMem] "+Q" (*pu32)
6181 , [uSpill] "=&r" (u32Spill)
6182 : [fBitMask] "r" (u32)
6183 : );
6184# else
6185 __asm__ __volatile__("Lstart_ASMAtomicXorU32_%=:\n\t"
6186 RTASM_ARM_DMB_SY
6187 "steor %w[fBitMask], %[pMem]\n\t"
6188 : [pMem] "+Q" (*pu32)
6189 : [fBitMask] "r" (u32)
6190 : );
6191# endif
6192# else
6193 RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_32(ASMAtomicXor32, pu32, DMB_SY,
6194 "eor %w[uNew], %w[uNew], %w[uVal]\n\t",
6195 "eor %[uNew], %[uNew], %[uVal]\n\t",
6196 [uVal] "r" (u32));
6197# endif
6198
6199# else
6200# error "Port me"
6201# endif
6202}
6203#endif
6204
6205
6206/**
6207 * Atomically XOR an unsigned 32-bit value and a memory location, ordered,
6208 * extended version (for bitmaps).
6209 *
6210 * @returns Old value.
6211 * @param pu32 Pointer to the variable to XOR @a u32 with.
6212 * @param u32 The value to XOR @a *pu32 with.
6213 */
6214DECLINLINE(uint32_t) ASMAtomicXorExU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_DEF
6215{
6216# if RT_INLINE_ASM_USES_INTRIN
6217 return (uint32_t)_InterlockedXor((long volatile RT_FAR *)pu32, u32);
6218
6219#elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
6220# if defined(RTASM_ARM64_USE_FEAT_LSE)
6221 uint32_t u32OldRet;
6222 __asm__ __volatile__("Lstart_ASMAtomicXorExU32_%=:\n\t"
6223# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
6224 "ldeoral %w[fBitMask], %w[uOldRet], %[pMem]\n\t"
6225# else
6226 RTASM_ARM_DMB_SY
6227 "ldeor %w[fBitMask], %w[uOldRet], %[pMem]\n\t"
6228# endif
6229 : [pMem] "+Q" (*pu32)
6230 , [uOldRet] "=&r" (u32OldRet)
6231 : [fBitMask] "r" (u32)
6232 : );
6233# else
6234 RTASM_ARM_LOAD_MODIFY_STORE_RET_OLD_32(ASMAtomicXorEx32, pu32, DMB_SY,
6235 "eor %w[uNew], %w[uOld], %w[uVal]\n\t",
6236 "eor %[uNew], %[uOld], %[uVal]\n\t",
6237 [uVal] "r" (u32));
6238# endif
6239 return u32OldRet;
6240
6241#else
6242 uint32_t u32RetOld = ASMAtomicUoReadU32(pu32);
6243 uint32_t u32New;
6244 do
6245 u32New = u32RetOld ^ u32;
6246 while (!ASMAtomicCmpXchgExU32(pu32, u32New, u32RetOld, &u32RetOld));
6247 return u32RetOld;
6248#endif
6249}
6250
6251
6252/**
6253 * Atomically XOR a signed 32-bit value, ordered.
6254 *
6255 * @param pi32 Pointer to the variable to XOR i32 with.
6256 * @param i32 The value to XOR *pi32 with.
6257 *
6258 * @remarks x86: Requires a 386 or later.
6259 */
6260DECLINLINE(void) ASMAtomicXorS32(int32_t volatile RT_FAR *pi32, int32_t i32) RT_NOTHROW_DEF
6261{
6262 ASMAtomicXorU32((uint32_t volatile RT_FAR *)pi32, (uint32_t)i32);
6263}
6264
6265
6266/**
6267 * Atomically OR an unsigned 32-bit value, unordered but interrupt safe.
6268 *
6269 * @param pu32 Pointer to the pointer variable to OR u32 with.
6270 * @param u32 The value to OR *pu32 with.
6271 *
6272 * @remarks x86: Requires a 386 or later.
6273 */
6274#if RT_INLINE_ASM_EXTERNAL_TMP_ARM
6275RT_ASM_DECL_PRAGMA_WATCOM(void) ASMAtomicUoOrU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_PROTO;
6276#else
6277DECLINLINE(void) ASMAtomicUoOrU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_DEF
6278{
6279# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
6280# if RT_INLINE_ASM_GNU_STYLE
6281 __asm__ __volatile__("orl %1, %0\n\t"
6282 : "=m" (*pu32)
6283 : "ir" (u32)
6284 , "m" (*pu32)
6285 : "cc");
6286# else
6287 __asm
6288 {
6289 mov eax, [u32]
6290# ifdef RT_ARCH_AMD64
6291 mov rdx, [pu32]
6292 or [rdx], eax
6293# else
6294 mov edx, [pu32]
6295 or [edx], eax
6296# endif
6297 }
6298# endif
6299
6300# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
6301# if RT_INLINE_ASM_USES_INTRIN
6302 _InterlockedOr_nf((long volatile RT_FAR *)pu32, u32); /* similar to the non-lse code below */
6303
6304 /* M1 benchmark: stset=1974 vs non-lse=6271 */
6305# elif defined(RTASM_ARM64_USE_FEAT_LSE)
6306 __asm__ __volatile__("Lstart_ASMAtomicUoOrU32_%=:\n\t"
6307 "stset %w[fBitsToSet], %[pMem]\n\t"
6308 : [pMem] "+Q" (*pu32)
6309 : [fBitsToSet] "r" (u32)
6310 : );
6311# else
6312 RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_32(ASMAtomicUoOrU32, pu32, NO_BARRIER,
6313 "orr %w[uNew], %w[uNew], %w[uVal]\n\t",
6314 "orr %[uNew], %[uNew], %[uVal]\n\t",
6315 [uVal] "r" (u32));
6316# endif
6317
6318# else
6319# error "Port me"
6320# endif
6321}
6322#endif
6323
6324
6325/**
6326 * Atomically OR an unsigned 32-bit value, unordered but interrupt safe,
6327 * extended version (for bitmap fallback).
6328 *
6329 * @returns Old value.
6330 * @param pu32 Pointer to the variable to OR @a u32 with.
6331 * @param u32 The value to OR @a *pu32 with.
6332 */
6333DECLINLINE(uint32_t) ASMAtomicUoOrExU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_DEF
6334{
6335#if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
6336# if RT_INLINE_ASM_USES_INTRIN
6337 return (uint32_t)_InterlockedOr_nf((long volatile RT_FAR *)pu32, u32); /* similar to the non-lse code below */
6338
6339# else
6340# if defined(RTASM_ARM64_USE_FEAT_LSE)
6341 uint32_t u32OldRet;
6342 __asm__ __volatile__("Lstart_ASMAtomicOrExU32_%=:\n\t"
6343 "ldset %w[fBitsToSet], %w[uOldRet], %[pMem]\n\t"
6344 : [pMem] "+Q" (*pu32)
6345 , [uOldRet] "=&r" (u32OldRet)
6346 : [fBitsToSet] "r" (u32)
6347 : );
6348# else
6349 RTASM_ARM_LOAD_MODIFY_STORE_RET_OLD_32(ASMAtomicUoOrExU32, pu32, NO_BARRIER,
6350 "orr %w[uNew], %w[uOld], %w[uVal]\n\t",
6351 "orr %[uNew], %[uOld], %[uVal]\n\t",
6352 [uVal] "r" (u32));
6353# endif
6354 return u32OldRet;
6355# endif
6356
6357#else
6358 return ASMAtomicOrExU32(pu32, u32); /* (we have no unordered cmpxchg primitive atm.) */
6359#endif
6360}
6361
6362
6363/**
6364 * Atomically OR a signed 32-bit value, unordered.
6365 *
6366 * @param pi32 Pointer to the pointer variable to OR u32 with.
6367 * @param i32 The value to OR *pu32 with.
6368 *
6369 * @remarks x86: Requires a 386 or later.
6370 */
6371DECLINLINE(void) ASMAtomicUoOrS32(int32_t volatile RT_FAR *pi32, int32_t i32) RT_NOTHROW_DEF
6372{
6373 ASMAtomicUoOrU32((uint32_t volatile RT_FAR *)pi32, (uint32_t)i32);
6374}
6375
6376
6377/**
6378 * Atomically OR an unsigned 64-bit value, unordered.
6379 *
6380 * @param pu64 Pointer to the pointer variable to OR u64 with.
6381 * @param u64 The value to OR *pu64 with.
6382 *
6383 * @remarks x86: Requires a Pentium or later.
6384 */
6385#if RT_INLINE_ASM_EXTERNAL_TMP_ARM
6386DECLASM(void) ASMAtomicUoOrU64(uint64_t volatile RT_FAR *pu64, uint64_t u64) RT_NOTHROW_PROTO;
6387#else
6388DECLINLINE(void) ASMAtomicUoOrU64(uint64_t volatile RT_FAR *pu64, uint64_t u64) RT_NOTHROW_DEF
6389{
6390# if RT_INLINE_ASM_GNU_STYLE && defined(RT_ARCH_AMD64)
6391 __asm__ __volatile__("orq %1, %q0\n\t"
6392 : "=m" (*pu64)
6393 : "r" (u64)
6394 , "m" (*pu64)
6395 : "cc");
6396
6397# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
6398# if RT_INLINE_ASM_USES_INTRIN
6399 _InterlockedOr64_nf((volatile int64_t *)pu64, (int64_t)u64); /* similar to the non-lse code below */
6400
6401# elif defined(RTASM_ARM64_USE_FEAT_LSE)
6402 __asm__ __volatile__("Lstart_ASMAtomicUoOrU64_%=:\n\t"
6403 "stset %[fBitsToSet], %[pMem]\n\t"
6404 : [pMem] "+Q" (*pu64)
6405 : [fBitsToSet] "r" (u64)
6406 : );
6407# else
6408 RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_64(ASMAtomicUoOrU64, pu64, NO_BARRIER,
6409 "orr %[uNew], %[uNew], %[uVal]\n\t"
6410 ,
6411 "orr %[uNew], %[uNew], %[uVal]\n\t"
6412 "orr %H[uNew], %H[uNew], %H[uVal]\n\t",
6413 [uVal] "r" (u64));
6414# endif
6415
6416# else
6417 for (;;)
6418 {
6419 uint64_t u64Old = ASMAtomicUoReadU64(pu64);
6420 uint64_t u64New = u64Old | u64;
6421 if (ASMAtomicCmpXchgU64(pu64, u64New, u64Old))
6422 break;
6423 ASMNopPause();
6424 }
6425# endif
6426}
6427#endif
6428
6429
6430/**
6431 * Atomically Or a signed 64-bit value, unordered.
6432 *
6433 * @param pi64 Pointer to the pointer variable to OR u64 with.
6434 * @param i64 The value to OR *pu64 with.
6435 *
6436 * @remarks x86: Requires a Pentium or later.
6437 */
6438DECLINLINE(void) ASMAtomicUoOrS64(int64_t volatile RT_FAR *pi64, int64_t i64) RT_NOTHROW_DEF
6439{
6440 ASMAtomicUoOrU64((uint64_t volatile RT_FAR *)pi64, (uint64_t)i64);
6441}
6442
6443
6444/**
6445 * Atomically And an unsigned 32-bit value, unordered.
6446 *
6447 * @param pu32 Pointer to the pointer variable to AND u32 with.
6448 * @param u32 The value to AND *pu32 with.
6449 *
6450 * @remarks x86: Requires a 386 or later.
6451 */
6452#if RT_INLINE_ASM_EXTERNAL_TMP_ARM
6453RT_ASM_DECL_PRAGMA_WATCOM(void) ASMAtomicUoAndU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_PROTO;
6454#else
6455DECLINLINE(void) ASMAtomicUoAndU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_DEF
6456{
6457# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
6458# if RT_INLINE_ASM_GNU_STYLE
6459 __asm__ __volatile__("andl %1, %0\n\t"
6460 : "=m" (*pu32)
6461 : "ir" (u32)
6462 , "m" (*pu32)
6463 : "cc");
6464# else
6465 __asm
6466 {
6467 mov eax, [u32]
6468# ifdef RT_ARCH_AMD64
6469 mov rdx, [pu32]
6470 and [rdx], eax
6471# else
6472 mov edx, [pu32]
6473 and [edx], eax
6474# endif
6475 }
6476# endif
6477
6478# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
6479# if RT_INLINE_ASM_USES_INTRIN
6480 _InterlockedAnd_nf((volatile long *)pu32, (long)u32); /* similar to the non-lse code below */
6481# elif defined(RTASM_ARM64_USE_FEAT_LSE)
6482 /* M1 benchmark: stclr=1884 vs non-lse=6299 (ps/call) */
6483 __asm__ __volatile__("Lstart_ASMAtomicUoAndU32_%=:\n\t"
6484 "stclr %w[fBitsToClear], %[pMem]\n\t"
6485 : [pMem] "+Q" (*pu32)
6486 : [fBitsToClear] "r" (~u32)
6487 : );
6488# else
6489 RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_32(ASMAtomicUoAnd32, pu32, NO_BARRIER,
6490 "and %w[uNew], %w[uNew], %w[uVal]\n\t",
6491 "and %[uNew], %[uNew], %[uVal]\n\t",
6492 [uVal] "r" (u32));
6493# endif
6494
6495# else
6496# error "Port me"
6497# endif
6498}
6499#endif
6500
6501
6502/**
6503 * Atomically AND an unsigned 32-bit value, unordered, extended version (for
6504 * bitmap fallback).
6505 *
6506 * @returns Old value.
6507 * @param pu32 Pointer to the pointer to AND @a u32 with.
6508 * @param u32 The value to AND @a *pu32 with.
6509 */
6510DECLINLINE(uint32_t) ASMAtomicUoAndExU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_DEF
6511{
6512#if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
6513# if RT_INLINE_ASM_USES_INTRIN
6514 return (uint32_t)_InterlockedAnd_nf((volatile long *)pu32, (long)u32); /* similar code to the non-lse case below */
6515# else
6516# if defined(RTASM_ARM64_USE_FEAT_LSE)
6517 uint32_t u32OldRet;
6518 __asm__ __volatile__("Lstart_ASMAtomicAndExU32_%=:\n\t"
6519 "ldclr %w[fBitsToClear], %w[uOldRet], %[pMem]\n\t"
6520 : [pMem] "+Q" (*pu32)
6521 , [uOldRet] "=&r" (u32OldRet)
6522 : [fBitsToClear] "r" (~u32)
6523 : );
6524# else
6525 RTASM_ARM_LOAD_MODIFY_STORE_RET_OLD_32(ASMAtomicUoAndEx32, pu32, NO_BARRIER,
6526 "and %w[uNew], %w[uOld], %w[uVal]\n\t",
6527 "and %[uNew], %[uOld], %[uVal]\n\t",
6528 [uVal] "r" (u32));
6529# endif
6530 return u32OldRet;
6531# endif
6532
6533#else
6534 return ASMAtomicAndExU32(pu32, u32); /* (we have no unordered cmpxchg primitive atm.) */
6535#endif
6536}
6537
6538
6539/**
6540 * Atomically And a signed 32-bit value, unordered.
6541 *
6542 * @param pi32 Pointer to the pointer variable to AND i32 with.
6543 * @param i32 The value to AND *pi32 with.
6544 *
6545 * @remarks x86: Requires a 386 or later.
6546 */
6547DECLINLINE(void) ASMAtomicUoAndS32(int32_t volatile RT_FAR *pi32, int32_t i32) RT_NOTHROW_DEF
6548{
6549 ASMAtomicUoAndU32((uint32_t volatile RT_FAR *)pi32, (uint32_t)i32);
6550}
6551
6552
6553/**
6554 * Atomically And an unsigned 64-bit value, unordered.
6555 *
6556 * @param pu64 Pointer to the pointer variable to AND u64 with.
6557 * @param u64 The value to AND *pu64 with.
6558 *
6559 * @remarks x86: Requires a Pentium or later.
6560 */
6561#if RT_INLINE_ASM_EXTERNAL_TMP_ARM
6562DECLASM(void) ASMAtomicUoAndU64(uint64_t volatile RT_FAR *pu64, uint64_t u64) RT_NOTHROW_PROTO;
6563#else
6564DECLINLINE(void) ASMAtomicUoAndU64(uint64_t volatile RT_FAR *pu64, uint64_t u64) RT_NOTHROW_DEF
6565{
6566# if RT_INLINE_ASM_GNU_STYLE && defined(RT_ARCH_AMD64)
6567 __asm__ __volatile__("andq %1, %0\n\t"
6568 : "=m" (*pu64)
6569 : "r" (u64)
6570 , "m" (*pu64)
6571 : "cc");
6572
6573# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
6574# if RT_INLINE_ASM_USES_INTRIN
6575 _InterlockedAnd64_nf((volatile int64_t *)pu64, (int64_t)u64); /* similar code to the non-lse case below */
6576
6577# elif defined(RTASM_ARM64_USE_FEAT_LSE)
6578 __asm__ __volatile__("Lstart_ASMAtomicUoAndU64_%=:\n\t"
6579 "stclr %[fBitsToClear], %[pMem]\n\t"
6580 : [pMem] "+Q" (*pu64)
6581 : [fBitsToClear] "r" (~u64)
6582 : );
6583# else
6584 RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_64(ASMAtomicUoAndU64, pu64, NO_BARRIER,
6585 "and %[uNew], %[uNew], %[uVal]\n\t"
6586 ,
6587 "and %[uNew], %[uNew], %[uVal]\n\t"
6588 "and %H[uNew], %H[uNew], %H[uVal]\n\t",
6589 [uVal] "r" (u64));
6590# endif
6591
6592# else
6593 for (;;)
6594 {
6595 uint64_t u64Old = ASMAtomicUoReadU64(pu64);
6596 uint64_t u64New = u64Old & u64;
6597 if (ASMAtomicCmpXchgU64(pu64, u64New, u64Old))
6598 break;
6599 ASMNopPause();
6600 }
6601# endif
6602}
6603#endif
6604
6605
6606/**
6607 * Atomically And a signed 64-bit value, unordered.
6608 *
6609 * @param pi64 Pointer to the pointer variable to AND i64 with.
6610 * @param i64 The value to AND *pi64 with.
6611 *
6612 * @remarks x86: Requires a Pentium or later.
6613 */
6614DECLINLINE(void) ASMAtomicUoAndS64(int64_t volatile RT_FAR *pi64, int64_t i64) RT_NOTHROW_DEF
6615{
6616 ASMAtomicUoAndU64((uint64_t volatile RT_FAR *)pi64, (uint64_t)i64);
6617}
6618
6619
6620/**
6621 * Atomically XOR an unsigned 32-bit value, unordered but interrupt safe.
6622 *
6623 * @param pu32 Pointer to the variable to XOR @a u32 with.
6624 * @param u32 The value to OR @a *pu32 with.
6625 *
6626 * @remarks x86: Requires a 386 or later.
6627 */
6628#if RT_INLINE_ASM_EXTERNAL_TMP_ARM
6629RT_ASM_DECL_PRAGMA_WATCOM(void) ASMAtomicUoXorU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_PROTO;
6630#else
6631DECLINLINE(void) ASMAtomicUoXorU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_DEF
6632{
6633# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
6634# if RT_INLINE_ASM_GNU_STYLE
6635 __asm__ __volatile__("xorl %1, %0\n\t"
6636 : "=m" (*pu32)
6637 : "ir" (u32)
6638 , "m" (*pu32)
6639 : "cc");
6640# else
6641 __asm
6642 {
6643 mov eax, [u32]
6644# ifdef RT_ARCH_AMD64
6645 mov rdx, [pu32]
6646 xor [rdx], eax
6647# else
6648 mov edx, [pu32]
6649 xor [edx], eax
6650# endif
6651 }
6652# endif
6653
6654# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
6655# if RT_INLINE_ASM_USES_INTRIN
6656 _InterlockedXor_nf((volatile long *)pu32, (long)u32); /* similar code to the non-lse case below */
6657# elif defined(RTASM_ARM64_USE_FEAT_LSE)
6658 __asm__ __volatile__("Lstart_ASMAtomicUoXorU32_%=:\n\t"
6659 "steor %w[fBitMask], %[pMem]\n\t"
6660 : [pMem] "+Q" (*pu32)
6661 : [fBitMask] "r" (u32)
6662 : );
6663# else
6664 RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_32(ASMAtomicUoXorU32, pu32, NO_BARRIER,
6665 "eor %w[uNew], %w[uNew], %w[uVal]\n\t",
6666 "eor %[uNew], %[uNew], %[uVal]\n\t",
6667 [uVal] "r" (u32));
6668# endif
6669
6670# else
6671# error "Port me"
6672# endif
6673}
6674#endif
6675
6676
6677/**
6678 * Atomically XOR an unsigned 32-bit value, unordered but interrupt safe,
6679 * extended version (for bitmap fallback).
6680 *
6681 * @returns Old value.
6682 * @param pu32 Pointer to the variable to XOR @a u32 with.
6683 * @param u32 The value to OR @a *pu32 with.
6684 */
6685DECLINLINE(uint32_t) ASMAtomicUoXorExU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_DEF
6686{
6687#if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
6688# if RT_INLINE_ASM_USES_INTRIN
6689 return (uint32_t)_InterlockedXor_nf((volatile long *)pu32, (long)u32); /* similar code to the non-lse case below */
6690# else
6691# if defined(RTASM_ARM64_USE_FEAT_LSE)
6692 uint32_t u32OldRet;
6693 __asm__ __volatile__("Lstart_ASMAtomicUoXorExU32_%=:\n\t"
6694 "ldeor %w[fBitMask], %w[uOldRet], %[pMem]\n\t"
6695 : [pMem] "+Q" (*pu32)
6696 , [uOldRet] "=&r" (u32OldRet)
6697 : [fBitMask] "r" (u32)
6698 : );
6699# else
6700 RTASM_ARM_LOAD_MODIFY_STORE_RET_OLD_32(ASMAtomicUoXorExU32, pu32, NO_BARRIER,
6701 "eor %w[uNew], %w[uOld], %w[uVal]\n\t",
6702 "eor %[uNew], %[uOld], %[uVal]\n\t",
6703 [uVal] "r" (u32));
6704# endif
6705 return u32OldRet;
6706# endif
6707
6708#else
6709 return ASMAtomicXorExU32(pu32, u32); /* (we have no unordered cmpxchg primitive atm.) */
6710#endif
6711}
6712
6713
6714/**
6715 * Atomically XOR a signed 32-bit value, unordered.
6716 *
6717 * @param pi32 Pointer to the variable to XOR @a u32 with.
6718 * @param i32 The value to XOR @a *pu32 with.
6719 *
6720 * @remarks x86: Requires a 386 or later.
6721 */
6722DECLINLINE(void) ASMAtomicUoXorS32(int32_t volatile RT_FAR *pi32, int32_t i32) RT_NOTHROW_DEF
6723{
6724 ASMAtomicUoXorU32((uint32_t volatile RT_FAR *)pi32, (uint32_t)i32);
6725}
6726
6727
6728/**
6729 * Atomically increment an unsigned 32-bit value, unordered.
6730 *
6731 * @returns the new value.
6732 * @param pu32 Pointer to the variable to increment.
6733 *
6734 * @remarks x86: Requires a 486 or later.
6735 */
6736#if RT_INLINE_ASM_EXTERNAL_TMP_ARM
6737RT_ASM_DECL_PRAGMA_WATCOM(uint32_t) ASMAtomicUoIncU32(uint32_t volatile RT_FAR *pu32) RT_NOTHROW_PROTO;
6738#else
6739DECLINLINE(uint32_t) ASMAtomicUoIncU32(uint32_t volatile RT_FAR *pu32) RT_NOTHROW_DEF
6740{
6741# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
6742 uint32_t u32;
6743# if RT_INLINE_ASM_GNU_STYLE
6744 __asm__ __volatile__("xaddl %0, %1\n\t"
6745 : "=r" (u32)
6746 , "=m" (*pu32)
6747 : "0" (1)
6748 , "m" (*pu32)
6749 : "memory" /** @todo why 'memory'? */
6750 , "cc");
6751 return u32 + 1;
6752# else
6753 __asm
6754 {
6755 mov eax, 1
6756# ifdef RT_ARCH_AMD64
6757 mov rdx, [pu32]
6758 xadd [rdx], eax
6759# else
6760 mov edx, [pu32]
6761 xadd [edx], eax
6762# endif
6763 mov u32, eax
6764 }
6765 return u32 + 1;
6766# endif
6767
6768# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
6769# if RT_INLINE_ASM_USES_INTRIN
6770 return _InterlockedIncrement_nf((volatile long *)pu32); /* generates code similar to the non-lse case below */
6771# else
6772 /* M1 benchmark: ldadd=2031 vs non-lse=6301 (ps/call) */
6773# if defined(RTASM_ARM64_USE_FEAT_LSE)
6774 uint32_t u32NewRet;
6775 __asm__ __volatile__("Lstart_ASMAtomicUoIncU32_%=:\n\t"
6776 "ldadd %w[uAddend], %w[uNewRet], %[pMem]\n\t"
6777 "add %w[uNewRet], %w[uNewRet], #1\n\t"
6778 : [pMem] "+Q" (*pu32)
6779 , [uNewRet] "=&r" (u32NewRet)
6780 : [uAddend] "r" ((uint32_t)1)
6781 : );
6782# else
6783 RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_32(ASMAtomicUoIncU32, pu32, NO_BARRIER,
6784 "add %w[uNew], %w[uNew], #1\n\t",
6785 "add %[uNew], %[uNew], #1\n\t" /* arm6 / thumb2+ */,
6786 "X" (0) /* dummy */);
6787# endif
6788 return u32NewRet;
6789# endif
6790
6791# else
6792# error "Port me"
6793# endif
6794}
6795#endif
6796
6797
6798/**
6799 * Atomically decrement an unsigned 32-bit value, unordered.
6800 *
6801 * @returns the new value.
6802 * @param pu32 Pointer to the variable to decrement.
6803 *
6804 * @remarks x86: Requires a 486 or later.
6805 */
6806#if RT_INLINE_ASM_EXTERNAL_TMP_ARM
6807RT_ASM_DECL_PRAGMA_WATCOM(uint32_t) ASMAtomicUoDecU32(uint32_t volatile RT_FAR *pu32) RT_NOTHROW_PROTO;
6808#else
6809DECLINLINE(uint32_t) ASMAtomicUoDecU32(uint32_t volatile RT_FAR *pu32) RT_NOTHROW_DEF
6810{
6811# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
6812 uint32_t u32;
6813# if RT_INLINE_ASM_GNU_STYLE
6814 __asm__ __volatile__("lock; xaddl %0, %1\n\t"
6815 : "=r" (u32)
6816 , "=m" (*pu32)
6817 : "0" (-1)
6818 , "m" (*pu32)
6819 : "memory"
6820 , "cc");
6821 return u32 - 1;
6822# else
6823 __asm
6824 {
6825 mov eax, -1
6826# ifdef RT_ARCH_AMD64
6827 mov rdx, [pu32]
6828 xadd [rdx], eax
6829# else
6830 mov edx, [pu32]
6831 xadd [edx], eax
6832# endif
6833 mov u32, eax
6834 }
6835 return u32 - 1;
6836# endif
6837
6838# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
6839# if RT_INLINE_ASM_USES_INTRIN
6840 return _InterlockedDecrement_nf((volatile long *)pu32); /* generates code similar to the non-lse case below */
6841# else
6842 /* M1 benchmark: ldadd=2101 vs non-lse=6268 (ps/call) */
6843# if defined(RTASM_ARM64_USE_FEAT_LSE)
6844 uint32_t u32NewRet;
6845 __asm__ __volatile__("Lstart_ASMAtomicUoDecU32_%=:\n\t"
6846 "ldadd %w[uAddend], %w[uNewRet], %[pMem]\n\t"
6847 "sub %w[uNewRet], %w[uNewRet], #1\n\t"
6848 : [pMem] "+Q" (*pu32)
6849 , [uNewRet] "=&r" (u32NewRet)
6850 : [uAddend] "r" (~(uint32_t)0)
6851 : );
6852# else
6853 RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_32(ASMAtomicUoDecU32, pu32, NO_BARRIER,
6854 "sub %w[uNew], %w[uNew], #1\n\t",
6855 "sub %[uNew], %[uNew], #1\n\t" /* arm6 / thumb2+ */,
6856 "X" (0) /* dummy */);
6857# endif
6858 return u32NewRet;
6859# endif
6860
6861# else
6862# error "Port me"
6863# endif
6864}
6865#endif
6866
6867/** @todo Move ASMByteSwapU16, ASMByteSwapU32 and ASMByteSwapU64 in their own
6868 * header as it's a common reason for including asm.h. */
6869
6870
6871/**
6872 * Reverse the byte order of the given 16-bit integer.
6873 *
6874 * @returns Revert
6875 * @param u16 16-bit integer value.
6876 */
6877#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
6878RT_ASM_DECL_PRAGMA_WATCOM(uint16_t) ASMByteSwapU16(uint16_t u16) RT_NOTHROW_PROTO;
6879#else
6880DECLINLINE(uint16_t) ASMByteSwapU16(uint16_t u16) RT_NOTHROW_DEF
6881{
6882# if RT_INLINE_ASM_USES_INTRIN
6883 return _byteswap_ushort(u16);
6884
6885# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
6886# if RT_INLINE_ASM_GNU_STYLE
6887 __asm__ ("rorw $8, %0" : "=r" (u16) : "0" (u16) : "cc");
6888# else
6889 _asm
6890 {
6891 mov ax, [u16]
6892 ror ax, 8
6893 mov [u16], ax
6894 }
6895# endif
6896 return u16;
6897
6898# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
6899 uint32_t u32Ret;
6900 __asm__ __volatile__(
6901# if defined(RT_ARCH_ARM64)
6902 "rev16 %w[uRet], %w[uVal]\n\t"
6903# else
6904 "rev16 %[uRet], %[uVal]\n\t"
6905# endif
6906 : [uRet] "=r" (u32Ret)
6907 : [uVal] "r" (u16));
6908 return (uint16_t)u32Ret;
6909
6910# else
6911# error "Port me"
6912# endif
6913}
6914#endif
6915
6916
6917/**
6918 * Reverse the byte order of the given 32-bit integer.
6919 *
6920 * @returns Revert
6921 * @param u32 32-bit integer value.
6922 */
6923#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
6924RT_ASM_DECL_PRAGMA_WATCOM(uint32_t) ASMByteSwapU32(uint32_t u32) RT_NOTHROW_PROTO;
6925#else
6926DECLINLINE(uint32_t) ASMByteSwapU32(uint32_t u32) RT_NOTHROW_DEF
6927{
6928# if RT_INLINE_ASM_USES_INTRIN
6929 return _byteswap_ulong(u32);
6930
6931# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
6932# if RT_INLINE_ASM_GNU_STYLE
6933 __asm__ ("bswapl %0" : "=r" (u32) : "0" (u32));
6934# else
6935 _asm
6936 {
6937 mov eax, [u32]
6938 bswap eax
6939 mov [u32], eax
6940 }
6941# endif
6942 return u32;
6943
6944# elif defined(RT_ARCH_ARM64)
6945 uint64_t u64Ret;
6946 __asm__ __volatile__("rev32 %[uRet], %[uVal]\n\t"
6947 : [uRet] "=r" (u64Ret)
6948 : [uVal] "r" ((uint64_t)u32));
6949 return (uint32_t)u64Ret;
6950
6951# elif defined(RT_ARCH_ARM32)
6952 __asm__ __volatile__("rev %[uRet], %[uVal]\n\t"
6953 : [uRet] "=r" (u32)
6954 : [uVal] "[uRet]" (u32));
6955 return u32;
6956
6957# else
6958# error "Port me"
6959# endif
6960}
6961#endif
6962
6963
6964/**
6965 * Reverse the byte order of the given 64-bit integer.
6966 *
6967 * @returns Revert
6968 * @param u64 64-bit integer value.
6969 */
6970DECLINLINE(uint64_t) ASMByteSwapU64(uint64_t u64) RT_NOTHROW_DEF
6971{
6972#if RT_INLINE_ASM_USES_INTRIN && (defined(RT_ARCH_AMD64) || defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32))
6973 return _byteswap_uint64(u64);
6974
6975# elif RT_INLINE_ASM_GNU_STYLE && defined(RT_ARCH_AMD64)
6976 __asm__ ("bswapq %0" : "=r" (u64) : "0" (u64));
6977 return u64;
6978
6979# elif defined(RT_ARCH_ARM64)
6980 __asm__ __volatile__("rev %[uRet], %[uVal]\n\t"
6981 : [uRet] "=r" (u64)
6982 : [uVal] "[uRet]" (u64));
6983 return u64;
6984
6985#else
6986 return (uint64_t)ASMByteSwapU32((uint32_t)u64) << 32
6987 | (uint64_t)ASMByteSwapU32((uint32_t)(u64 >> 32));
6988#endif
6989}
6990
6991
6992
6993/** @defgroup grp_inline_bits Bitmap Operations
6994 *
6995 * @todo Move these into a separate header, with standard IPRT prefix
6996 * (RTBitmapXxx). Move the more complex (searched) stuff into C source
6997 * files.
6998 *
6999 * @{
7000 */
7001
7002
7003/**
7004 * Sets a bit in a bitmap.
7005 *
7006 * @param pvBitmap Pointer to the bitmap (little endian). This should be
7007 * 32-bit aligned.
7008 * @param iBit The bit to set.
7009 *
7010 * @remarks The 32-bit aligning of pvBitmap is not a strict requirement.
7011 * However, doing so will yield better performance as well as avoiding
7012 * traps accessing the last bits in the bitmap.
7013 */
7014#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
7015RT_ASM_DECL_PRAGMA_WATCOM(void) ASMBitSet(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_PROTO;
7016#else
7017DECLINLINE(void) ASMBitSet(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_DEF
7018{
7019# if RT_INLINE_ASM_USES_INTRIN
7020 _bittestandset((long RT_FAR *)pvBitmap, iBit);
7021
7022# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
7023# if RT_INLINE_ASM_GNU_STYLE
7024 __asm__ __volatile__("btsl %1, %0"
7025 : "=m" (*(volatile long RT_FAR *)pvBitmap)
7026 : "Ir" (iBit)
7027 , "m" (*(volatile long RT_FAR *)pvBitmap)
7028 : "memory"
7029 , "cc");
7030# else
7031 __asm
7032 {
7033# ifdef RT_ARCH_AMD64
7034 mov rax, [pvBitmap]
7035 mov edx, [iBit]
7036 bts [rax], edx
7037# else
7038 mov eax, [pvBitmap]
7039 mov edx, [iBit]
7040 bts [eax], edx
7041# endif
7042 }
7043# endif
7044
7045# else
7046 int32_t offBitmap = iBit / 32;
7047 AssertStmt(!((uintptr_t)pvBitmap & 3), offBitmap += (uintptr_t)pvBitmap & 3; iBit += ((uintptr_t)pvBitmap & 3) * 8);
7048 ASMAtomicUoOrU32(&((uint32_t volatile *)pvBitmap)[offBitmap], RT_H2LE_U32(RT_BIT_32(iBit & 31)));
7049# endif
7050}
7051#endif
7052
7053
7054/**
7055 * Atomically sets a bit in a bitmap, ordered.
7056 *
7057 * @param pvBitmap Pointer to the bitmap (little endian). Must be 32-bit
7058 * aligned, otherwise the memory access isn't atomic!
7059 * @param iBit The bit to set.
7060 *
7061 * @remarks x86: Requires a 386 or later.
7062 */
7063#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
7064RT_ASM_DECL_PRAGMA_WATCOM(void) ASMAtomicBitSet(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_PROTO;
7065#else
7066DECLINLINE(void) ASMAtomicBitSet(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_DEF
7067{
7068 AssertMsg(!((uintptr_t)pvBitmap & 3), ("address %p not 32-bit aligned", pvBitmap));
7069# if RT_INLINE_ASM_USES_INTRIN
7070 _interlockedbittestandset((long RT_FAR *)pvBitmap, iBit);
7071
7072# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
7073# if RT_INLINE_ASM_GNU_STYLE
7074 __asm__ __volatile__("lock; btsl %1, %0"
7075 : "=m" (*(volatile long *)pvBitmap)
7076 : "Ir" (iBit)
7077 , "m" (*(volatile long *)pvBitmap)
7078 : "memory"
7079 , "cc");
7080# else
7081 __asm
7082 {
7083# ifdef RT_ARCH_AMD64
7084 mov rax, [pvBitmap]
7085 mov edx, [iBit]
7086 lock bts [rax], edx
7087# else
7088 mov eax, [pvBitmap]
7089 mov edx, [iBit]
7090 lock bts [eax], edx
7091# endif
7092 }
7093# endif
7094
7095# else
7096 ASMAtomicOrU32(&((uint32_t volatile *)pvBitmap)[iBit / 32], RT_H2LE_U32(RT_BIT_32(iBit & 31)));
7097# endif
7098}
7099#endif
7100
7101
7102/**
7103 * Clears a bit in a bitmap.
7104 *
7105 * @param pvBitmap Pointer to the bitmap (little endian).
7106 * @param iBit The bit to clear.
7107 *
7108 * @remarks The 32-bit aligning of pvBitmap is not a strict requirement.
7109 * However, doing so will yield better performance as well as avoiding
7110 * traps accessing the last bits in the bitmap.
7111 */
7112#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
7113RT_ASM_DECL_PRAGMA_WATCOM(void) ASMBitClear(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_PROTO;
7114#else
7115DECLINLINE(void) ASMBitClear(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_DEF
7116{
7117# if RT_INLINE_ASM_USES_INTRIN
7118 _bittestandreset((long RT_FAR *)pvBitmap, iBit);
7119
7120# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
7121# if RT_INLINE_ASM_GNU_STYLE
7122 __asm__ __volatile__("btrl %1, %0"
7123 : "=m" (*(volatile long RT_FAR *)pvBitmap)
7124 : "Ir" (iBit)
7125 , "m" (*(volatile long RT_FAR *)pvBitmap)
7126 : "memory"
7127 , "cc");
7128# else
7129 __asm
7130 {
7131# ifdef RT_ARCH_AMD64
7132 mov rax, [pvBitmap]
7133 mov edx, [iBit]
7134 btr [rax], edx
7135# else
7136 mov eax, [pvBitmap]
7137 mov edx, [iBit]
7138 btr [eax], edx
7139# endif
7140 }
7141# endif
7142
7143# else
7144 int32_t offBitmap = iBit / 32;
7145 AssertStmt(!((uintptr_t)pvBitmap & 3), offBitmap += (uintptr_t)pvBitmap & 3; iBit += ((uintptr_t)pvBitmap & 3) * 8);
7146 ASMAtomicUoAndU32(&((uint32_t volatile *)pvBitmap)[offBitmap], RT_H2LE_U32(~RT_BIT_32(iBit & 31)));
7147# endif
7148}
7149#endif
7150
7151
7152/**
7153 * Atomically clears a bit in a bitmap, ordered.
7154 *
7155 * @param pvBitmap Pointer to the bitmap (little endian). Must be 32-bit
7156 * aligned, otherwise the memory access isn't atomic!
7157 * @param iBit The bit to toggle set.
7158 *
7159 * @remarks No memory barrier, take care on smp.
7160 * @remarks x86: Requires a 386 or later.
7161 */
7162#if RT_INLINE_ASM_EXTERNAL_TMP_ARM
7163RT_ASM_DECL_PRAGMA_WATCOM(void) ASMAtomicBitClear(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_PROTO;
7164#else
7165DECLINLINE(void) ASMAtomicBitClear(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_DEF
7166{
7167 AssertMsg(!((uintptr_t)pvBitmap & 3), ("address %p not 32-bit aligned", pvBitmap));
7168# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
7169# if RT_INLINE_ASM_GNU_STYLE
7170 __asm__ __volatile__("lock; btrl %1, %0"
7171 : "=m" (*(volatile long RT_FAR *)pvBitmap)
7172 : "Ir" (iBit)
7173 , "m" (*(volatile long RT_FAR *)pvBitmap)
7174 : "memory"
7175 , "cc");
7176# else
7177 __asm
7178 {
7179# ifdef RT_ARCH_AMD64
7180 mov rax, [pvBitmap]
7181 mov edx, [iBit]
7182 lock btr [rax], edx
7183# else
7184 mov eax, [pvBitmap]
7185 mov edx, [iBit]
7186 lock btr [eax], edx
7187# endif
7188 }
7189# endif
7190# else
7191 ASMAtomicAndU32(&((uint32_t volatile *)pvBitmap)[iBit / 32], RT_H2LE_U32(~RT_BIT_32(iBit & 31)));
7192# endif
7193}
7194#endif
7195
7196
7197/**
7198 * Toggles a bit in a bitmap.
7199 *
7200 * @param pvBitmap Pointer to the bitmap (little endian).
7201 * @param iBit The bit to toggle.
7202 *
7203 * @remarks The 32-bit aligning of pvBitmap is not a strict requirement.
7204 * However, doing so will yield better performance as well as avoiding
7205 * traps accessing the last bits in the bitmap.
7206 */
7207#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
7208RT_ASM_DECL_PRAGMA_WATCOM(void) ASMBitToggle(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_PROTO;
7209#else
7210DECLINLINE(void) ASMBitToggle(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_DEF
7211{
7212# if RT_INLINE_ASM_USES_INTRIN
7213 _bittestandcomplement((long RT_FAR *)pvBitmap, iBit);
7214# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
7215# if RT_INLINE_ASM_GNU_STYLE
7216 __asm__ __volatile__("btcl %1, %0"
7217 : "=m" (*(volatile long *)pvBitmap)
7218 : "Ir" (iBit)
7219 , "m" (*(volatile long *)pvBitmap)
7220 : "memory"
7221 , "cc");
7222# else
7223 __asm
7224 {
7225# ifdef RT_ARCH_AMD64
7226 mov rax, [pvBitmap]
7227 mov edx, [iBit]
7228 btc [rax], edx
7229# else
7230 mov eax, [pvBitmap]
7231 mov edx, [iBit]
7232 btc [eax], edx
7233# endif
7234 }
7235# endif
7236# else
7237 int32_t offBitmap = iBit / 32;
7238 AssertStmt(!((uintptr_t)pvBitmap & 3), offBitmap += (uintptr_t)pvBitmap & 3; iBit += ((uintptr_t)pvBitmap & 3) * 8);
7239 ASMAtomicUoXorU32(&((uint32_t volatile *)pvBitmap)[offBitmap], RT_H2LE_U32(RT_BIT_32(iBit & 31)));
7240# endif
7241}
7242#endif
7243
7244
7245/**
7246 * Atomically toggles a bit in a bitmap, ordered.
7247 *
7248 * @param pvBitmap Pointer to the bitmap (little endian). Must be 32-bit
7249 * aligned, otherwise the memory access isn't atomic!
7250 * @param iBit The bit to test and set.
7251 *
7252 * @remarks x86: Requires a 386 or later.
7253 */
7254#if RT_INLINE_ASM_EXTERNAL_TMP_ARM
7255RT_ASM_DECL_PRAGMA_WATCOM(void) ASMAtomicBitToggle(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_PROTO;
7256#else
7257DECLINLINE(void) ASMAtomicBitToggle(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_DEF
7258{
7259 AssertMsg(!((uintptr_t)pvBitmap & 3), ("address %p not 32-bit aligned", pvBitmap));
7260# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
7261# if RT_INLINE_ASM_GNU_STYLE
7262 __asm__ __volatile__("lock; btcl %1, %0"
7263 : "=m" (*(volatile long RT_FAR *)pvBitmap)
7264 : "Ir" (iBit)
7265 , "m" (*(volatile long RT_FAR *)pvBitmap)
7266 : "memory"
7267 , "cc");
7268# else
7269 __asm
7270 {
7271# ifdef RT_ARCH_AMD64
7272 mov rax, [pvBitmap]
7273 mov edx, [iBit]
7274 lock btc [rax], edx
7275# else
7276 mov eax, [pvBitmap]
7277 mov edx, [iBit]
7278 lock btc [eax], edx
7279# endif
7280 }
7281# endif
7282# else
7283 ASMAtomicXorU32(&((uint32_t volatile *)pvBitmap)[iBit / 32], RT_H2LE_U32(RT_BIT_32(iBit & 31)));
7284# endif
7285}
7286#endif
7287
7288
7289/**
7290 * Tests and sets a bit in a bitmap.
7291 *
7292 * @returns true if the bit was set.
7293 * @returns false if the bit was clear.
7294 *
7295 * @param pvBitmap Pointer to the bitmap (little endian).
7296 * @param iBit The bit to test and set.
7297 *
7298 * @remarks The 32-bit aligning of pvBitmap is not a strict requirement.
7299 * However, doing so will yield better performance as well as avoiding
7300 * traps accessing the last bits in the bitmap.
7301 */
7302#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
7303RT_ASM_DECL_PRAGMA_WATCOM(bool) ASMBitTestAndSet(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_PROTO;
7304#else
7305DECLINLINE(bool) ASMBitTestAndSet(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_DEF
7306{
7307 union { bool f; uint32_t u32; uint8_t u8; } rc;
7308# if RT_INLINE_ASM_USES_INTRIN
7309 rc.u8 = _bittestandset((long RT_FAR *)pvBitmap, iBit);
7310
7311# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
7312# if RT_INLINE_ASM_GNU_STYLE
7313 __asm__ __volatile__("btsl %2, %1\n\t"
7314 "setc %b0\n\t"
7315 "andl $1, %0\n\t"
7316 : "=q" (rc.u32)
7317 , "=m" (*(volatile long RT_FAR *)pvBitmap)
7318 : "Ir" (iBit)
7319 , "m" (*(volatile long RT_FAR *)pvBitmap)
7320 : "memory"
7321 , "cc");
7322# else
7323 __asm
7324 {
7325 mov edx, [iBit]
7326# ifdef RT_ARCH_AMD64
7327 mov rax, [pvBitmap]
7328 bts [rax], edx
7329# else
7330 mov eax, [pvBitmap]
7331 bts [eax], edx
7332# endif
7333 setc al
7334 and eax, 1
7335 mov [rc.u32], eax
7336 }
7337# endif
7338
7339# else
7340 int32_t offBitmap = iBit / 32;
7341 AssertStmt(!((uintptr_t)pvBitmap & 3), offBitmap += (uintptr_t)pvBitmap & 3; iBit += ((uintptr_t)pvBitmap & 3) * 8);
7342 rc.u32 = RT_LE2H_U32(ASMAtomicUoOrExU32(&((uint32_t volatile *)pvBitmap)[offBitmap], RT_H2LE_U32(RT_BIT_32(iBit & 31))))
7343 >> (iBit & 31);
7344 rc.u32 &= 1;
7345# endif
7346 return rc.f;
7347}
7348#endif
7349
7350
7351/**
7352 * Atomically tests and sets a bit in a bitmap, ordered.
7353 *
7354 * @returns true if the bit was set.
7355 * @returns false if the bit was clear.
7356 *
7357 * @param pvBitmap Pointer to the bitmap (little endian). Must be 32-bit
7358 * aligned, otherwise the memory access isn't atomic!
7359 * @param iBit The bit to set.
7360 *
7361 * @remarks x86: Requires a 386 or later.
7362 */
7363#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
7364RT_ASM_DECL_PRAGMA_WATCOM(bool) ASMAtomicBitTestAndSet(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_PROTO;
7365#else
7366DECLINLINE(bool) ASMAtomicBitTestAndSet(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_DEF
7367{
7368 union { bool f; uint32_t u32; uint8_t u8; } rc;
7369 AssertMsg(!((uintptr_t)pvBitmap & 3), ("address %p not 32-bit aligned", pvBitmap));
7370# if RT_INLINE_ASM_USES_INTRIN
7371 rc.u8 = _interlockedbittestandset((long RT_FAR *)pvBitmap, iBit);
7372# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
7373# if RT_INLINE_ASM_GNU_STYLE
7374 __asm__ __volatile__("lock; btsl %2, %1\n\t"
7375 "setc %b0\n\t"
7376 "andl $1, %0\n\t"
7377 : "=q" (rc.u32)
7378 , "=m" (*(volatile long RT_FAR *)pvBitmap)
7379 : "Ir" (iBit)
7380 , "m" (*(volatile long RT_FAR *)pvBitmap)
7381 : "memory"
7382 , "cc");
7383# else
7384 __asm
7385 {
7386 mov edx, [iBit]
7387# ifdef RT_ARCH_AMD64
7388 mov rax, [pvBitmap]
7389 lock bts [rax], edx
7390# else
7391 mov eax, [pvBitmap]
7392 lock bts [eax], edx
7393# endif
7394 setc al
7395 and eax, 1
7396 mov [rc.u32], eax
7397 }
7398# endif
7399
7400# else
7401 rc.u32 = RT_LE2H_U32(ASMAtomicOrExU32(&((uint32_t volatile *)pvBitmap)[iBit / 32], RT_H2LE_U32(RT_BIT_32(iBit & 31))))
7402 >> (iBit & 31);
7403 rc.u32 &= 1;
7404# endif
7405 return rc.f;
7406}
7407#endif
7408
7409
7410/**
7411 * Tests and clears a bit in a bitmap.
7412 *
7413 * @returns true if the bit was set.
7414 * @returns false if the bit was clear.
7415 *
7416 * @param pvBitmap Pointer to the bitmap (little endian).
7417 * @param iBit The bit to test and clear.
7418 *
7419 * @remarks The 32-bit aligning of pvBitmap is not a strict requirement.
7420 * However, doing so will yield better performance as well as avoiding
7421 * traps accessing the last bits in the bitmap.
7422 */
7423#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
7424RT_ASM_DECL_PRAGMA_WATCOM(bool) ASMBitTestAndClear(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_PROTO;
7425#else
7426DECLINLINE(bool) ASMBitTestAndClear(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_DEF
7427{
7428 union { bool f; uint32_t u32; uint8_t u8; } rc;
7429# if RT_INLINE_ASM_USES_INTRIN
7430 rc.u8 = _bittestandreset((long RT_FAR *)pvBitmap, iBit);
7431
7432# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
7433# if RT_INLINE_ASM_GNU_STYLE
7434 __asm__ __volatile__("btrl %2, %1\n\t"
7435 "setc %b0\n\t"
7436 "andl $1, %0\n\t"
7437 : "=q" (rc.u32)
7438 , "=m" (*(volatile long RT_FAR *)pvBitmap)
7439 : "Ir" (iBit)
7440 , "m" (*(volatile long RT_FAR *)pvBitmap)
7441 : "memory"
7442 , "cc");
7443# else
7444 __asm
7445 {
7446 mov edx, [iBit]
7447# ifdef RT_ARCH_AMD64
7448 mov rax, [pvBitmap]
7449 btr [rax], edx
7450# else
7451 mov eax, [pvBitmap]
7452 btr [eax], edx
7453# endif
7454 setc al
7455 and eax, 1
7456 mov [rc.u32], eax
7457 }
7458# endif
7459
7460# else
7461 int32_t offBitmap = iBit / 32;
7462 AssertStmt(!((uintptr_t)pvBitmap & 3), offBitmap += (uintptr_t)pvBitmap & 3; iBit += ((uintptr_t)pvBitmap & 3) * 8);
7463 rc.u32 = RT_LE2H_U32(ASMAtomicUoAndExU32(&((uint32_t volatile *)pvBitmap)[offBitmap], RT_H2LE_U32(~RT_BIT_32(iBit & 31))))
7464 >> (iBit & 31);
7465 rc.u32 &= 1;
7466# endif
7467 return rc.f;
7468}
7469#endif
7470
7471
7472/**
7473 * Atomically tests and clears a bit in a bitmap, ordered.
7474 *
7475 * @returns true if the bit was set.
7476 * @returns false if the bit was clear.
7477 *
7478 * @param pvBitmap Pointer to the bitmap (little endian). Must be 32-bit
7479 * aligned, otherwise the memory access isn't atomic!
7480 * @param iBit The bit to test and clear.
7481 *
7482 * @remarks No memory barrier, take care on smp.
7483 * @remarks x86: Requires a 386 or later.
7484 */
7485#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
7486RT_ASM_DECL_PRAGMA_WATCOM(bool) ASMAtomicBitTestAndClear(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_PROTO;
7487#else
7488DECLINLINE(bool) ASMAtomicBitTestAndClear(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_DEF
7489{
7490 union { bool f; uint32_t u32; uint8_t u8; } rc;
7491 AssertMsg(!((uintptr_t)pvBitmap & 3), ("address %p not 32-bit aligned", pvBitmap));
7492# if RT_INLINE_ASM_USES_INTRIN
7493 rc.u8 = _interlockedbittestandreset((long RT_FAR *)pvBitmap, iBit);
7494
7495# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
7496# if RT_INLINE_ASM_GNU_STYLE
7497 __asm__ __volatile__("lock; btrl %2, %1\n\t"
7498 "setc %b0\n\t"
7499 "andl $1, %0\n\t"
7500 : "=q" (rc.u32)
7501 , "=m" (*(volatile long RT_FAR *)pvBitmap)
7502 : "Ir" (iBit)
7503 , "m" (*(volatile long RT_FAR *)pvBitmap)
7504 : "memory"
7505 , "cc");
7506# else
7507 __asm
7508 {
7509 mov edx, [iBit]
7510# ifdef RT_ARCH_AMD64
7511 mov rax, [pvBitmap]
7512 lock btr [rax], edx
7513# else
7514 mov eax, [pvBitmap]
7515 lock btr [eax], edx
7516# endif
7517 setc al
7518 and eax, 1
7519 mov [rc.u32], eax
7520 }
7521# endif
7522
7523# else
7524 rc.u32 = RT_LE2H_U32(ASMAtomicAndExU32(&((uint32_t volatile *)pvBitmap)[iBit / 32], RT_H2LE_U32(~RT_BIT_32(iBit & 31))))
7525 >> (iBit & 31);
7526 rc.u32 &= 1;
7527# endif
7528 return rc.f;
7529}
7530#endif
7531
7532
7533/**
7534 * Tests and toggles a bit in a bitmap.
7535 *
7536 * @returns true if the bit was set.
7537 * @returns false if the bit was clear.
7538 *
7539 * @param pvBitmap Pointer to the bitmap (little endian).
7540 * @param iBit The bit to test and toggle.
7541 *
7542 * @remarks The 32-bit aligning of pvBitmap is not a strict requirement.
7543 * However, doing so will yield better performance as well as avoiding
7544 * traps accessing the last bits in the bitmap.
7545 */
7546#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
7547RT_ASM_DECL_PRAGMA_WATCOM(bool) ASMBitTestAndToggle(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_PROTO;
7548#else
7549DECLINLINE(bool) ASMBitTestAndToggle(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_DEF
7550{
7551 union { bool f; uint32_t u32; uint8_t u8; } rc;
7552# if RT_INLINE_ASM_USES_INTRIN
7553 rc.u8 = _bittestandcomplement((long RT_FAR *)pvBitmap, iBit);
7554
7555# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
7556# if RT_INLINE_ASM_GNU_STYLE
7557 __asm__ __volatile__("btcl %2, %1\n\t"
7558 "setc %b0\n\t"
7559 "andl $1, %0\n\t"
7560 : "=q" (rc.u32)
7561 , "=m" (*(volatile long RT_FAR *)pvBitmap)
7562 : "Ir" (iBit)
7563 , "m" (*(volatile long RT_FAR *)pvBitmap)
7564 : "memory"
7565 , "cc");
7566# else
7567 __asm
7568 {
7569 mov edx, [iBit]
7570# ifdef RT_ARCH_AMD64
7571 mov rax, [pvBitmap]
7572 btc [rax], edx
7573# else
7574 mov eax, [pvBitmap]
7575 btc [eax], edx
7576# endif
7577 setc al
7578 and eax, 1
7579 mov [rc.u32], eax
7580 }
7581# endif
7582
7583# else
7584 int32_t offBitmap = iBit / 32;
7585 AssertStmt(!((uintptr_t)pvBitmap & 3), offBitmap += (uintptr_t)pvBitmap & 3; iBit += ((uintptr_t)pvBitmap & 3) * 8);
7586 rc.u32 = RT_LE2H_U32(ASMAtomicUoXorExU32(&((uint32_t volatile *)pvBitmap)[offBitmap], RT_H2LE_U32(RT_BIT_32(iBit & 31))))
7587 >> (iBit & 31);
7588 rc.u32 &= 1;
7589# endif
7590 return rc.f;
7591}
7592#endif
7593
7594
7595/**
7596 * Atomically tests and toggles a bit in a bitmap, ordered.
7597 *
7598 * @returns true if the bit was set.
7599 * @returns false if the bit was clear.
7600 *
7601 * @param pvBitmap Pointer to the bitmap (little endian). Must be 32-bit
7602 * aligned, otherwise the memory access isn't atomic!
7603 * @param iBit The bit to test and toggle.
7604 *
7605 * @remarks x86: Requires a 386 or later.
7606 */
7607#if RT_INLINE_ASM_EXTERNAL_TMP_ARM
7608RT_ASM_DECL_PRAGMA_WATCOM(bool) ASMAtomicBitTestAndToggle(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_PROTO;
7609#else
7610DECLINLINE(bool) ASMAtomicBitTestAndToggle(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_DEF
7611{
7612 union { bool f; uint32_t u32; uint8_t u8; } rc;
7613 AssertMsg(!((uintptr_t)pvBitmap & 3), ("address %p not 32-bit aligned", pvBitmap));
7614# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
7615# if RT_INLINE_ASM_GNU_STYLE
7616 __asm__ __volatile__("lock; btcl %2, %1\n\t"
7617 "setc %b0\n\t"
7618 "andl $1, %0\n\t"
7619 : "=q" (rc.u32)
7620 , "=m" (*(volatile long RT_FAR *)pvBitmap)
7621 : "Ir" (iBit)
7622 , "m" (*(volatile long RT_FAR *)pvBitmap)
7623 : "memory"
7624 , "cc");
7625# else
7626 __asm
7627 {
7628 mov edx, [iBit]
7629# ifdef RT_ARCH_AMD64
7630 mov rax, [pvBitmap]
7631 lock btc [rax], edx
7632# else
7633 mov eax, [pvBitmap]
7634 lock btc [eax], edx
7635# endif
7636 setc al
7637 and eax, 1
7638 mov [rc.u32], eax
7639 }
7640# endif
7641
7642# else
7643 rc.u32 = RT_H2LE_U32(ASMAtomicXorExU32(&((uint32_t volatile *)pvBitmap)[iBit / 32], RT_LE2H_U32(RT_BIT_32(iBit & 31))))
7644 >> (iBit & 31);
7645 rc.u32 &= 1;
7646# endif
7647 return rc.f;
7648}
7649#endif
7650
7651
7652/**
7653 * Tests if a bit in a bitmap is set.
7654 *
7655 * @returns true if the bit is set.
7656 * @returns false if the bit is clear.
7657 *
7658 * @param pvBitmap Pointer to the bitmap (little endian).
7659 * @param iBit The bit to test.
7660 *
7661 * @remarks The 32-bit aligning of pvBitmap is not a strict requirement.
7662 * However, doing so will yield better performance as well as avoiding
7663 * traps accessing the last bits in the bitmap.
7664 */
7665#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
7666RT_ASM_DECL_PRAGMA_WATCOM(bool) ASMBitTest(const volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_PROTO;
7667#else
7668DECLINLINE(bool) ASMBitTest(const volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_DEF
7669{
7670 union { bool f; uint32_t u32; uint8_t u8; } rc;
7671# if RT_INLINE_ASM_USES_INTRIN
7672 rc.u32 = _bittest((long *)pvBitmap, iBit);
7673
7674# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
7675# if RT_INLINE_ASM_GNU_STYLE
7676
7677 __asm__ __volatile__("btl %2, %1\n\t"
7678 "setc %b0\n\t"
7679 "andl $1, %0\n\t"
7680 : "=q" (rc.u32)
7681 : "m" (*(const volatile long RT_FAR *)pvBitmap)
7682 , "Ir" (iBit)
7683 : "memory"
7684 , "cc");
7685# else
7686 __asm
7687 {
7688 mov edx, [iBit]
7689# ifdef RT_ARCH_AMD64
7690 mov rax, [pvBitmap]
7691 bt [rax], edx
7692# else
7693 mov eax, [pvBitmap]
7694 bt [eax], edx
7695# endif
7696 setc al
7697 and eax, 1
7698 mov [rc.u32], eax
7699 }
7700# endif
7701
7702# else
7703 int32_t offBitmap = iBit / 32;
7704 AssertRelease(!((uintptr_t)pvBitmap & (sizeof(uint32_t) - 1)));
7705 rc.u32 = RT_LE2H_U32(ASMAtomicUoReadU32(&((uint32_t volatile *)pvBitmap)[offBitmap])) >> (iBit & 31);
7706 rc.u32 &= 1;
7707# endif
7708 return rc.f;
7709}
7710#endif
7711
7712
7713#ifdef IPRT_INCLUDED_asm_mem_h
7714
7715/**
7716 * Clears a bit range within a bitmap.
7717 *
7718 * @param pvBitmap Pointer to the bitmap (little endian).
7719 * @param iBitStart The First bit to clear.
7720 * @param iBitEnd The first bit not to clear.
7721 */
7722DECLINLINE(void) ASMBitClearRange(volatile void RT_FAR *pvBitmap, size_t iBitStart, size_t iBitEnd) RT_NOTHROW_DEF
7723{
7724 if (iBitStart < iBitEnd)
7725 {
7726 uint32_t volatile RT_FAR *pu32 = (volatile uint32_t RT_FAR *)pvBitmap + (iBitStart >> 5);
7727 size_t iStart = iBitStart & ~(size_t)31;
7728 size_t iEnd = iBitEnd & ~(size_t)31;
7729 if (iStart == iEnd)
7730 *pu32 &= RT_H2LE_U32(((UINT32_C(1) << (iBitStart & 31)) - 1) | ~((UINT32_C(1) << (iBitEnd & 31)) - 1));
7731 else
7732 {
7733 /* bits in first dword. */
7734 if (iBitStart & 31)
7735 {
7736 *pu32 &= RT_H2LE_U32((UINT32_C(1) << (iBitStart & 31)) - 1);
7737 pu32++;
7738 iBitStart = iStart + 32;
7739 }
7740
7741 /* whole dwords. */
7742 if (iBitStart != iEnd)
7743 ASMMemZero32(pu32, (iEnd - iBitStart) >> 3);
7744
7745 /* bits in last dword. */
7746 if (iBitEnd & 31)
7747 {
7748 pu32 = (volatile uint32_t RT_FAR *)pvBitmap + (iBitEnd >> 5);
7749 *pu32 &= RT_H2LE_U32(~((UINT32_C(1) << (iBitEnd & 31)) - 1));
7750 }
7751 }
7752 }
7753}
7754
7755
7756/**
7757 * Sets a bit range within a bitmap.
7758 *
7759 * @param pvBitmap Pointer to the bitmap (little endian).
7760 * @param iBitStart The First bit to set.
7761 * @param iBitEnd The first bit not to set.
7762 */
7763DECLINLINE(void) ASMBitSetRange(volatile void RT_FAR *pvBitmap, size_t iBitStart, size_t iBitEnd) RT_NOTHROW_DEF
7764{
7765 if (iBitStart < iBitEnd)
7766 {
7767 uint32_t volatile RT_FAR *pu32 = (volatile uint32_t RT_FAR *)pvBitmap + (iBitStart >> 5);
7768 size_t iStart = iBitStart & ~(size_t)31;
7769 size_t iEnd = iBitEnd & ~(size_t)31;
7770 if (iStart == iEnd)
7771 *pu32 |= RT_H2LE_U32(((UINT32_C(1) << (iBitEnd - iBitStart)) - 1) << (iBitStart & 31));
7772 else
7773 {
7774 /* bits in first dword. */
7775 if (iBitStart & 31)
7776 {
7777 *pu32 |= RT_H2LE_U32(~((UINT32_C(1) << (iBitStart & 31)) - 1));
7778 pu32++;
7779 iBitStart = iStart + 32;
7780 }
7781
7782 /* whole dword. */
7783 if (iBitStart != iEnd)
7784 ASMMemFill32(pu32, (iEnd - iBitStart) >> 3, ~UINT32_C(0));
7785
7786 /* bits in last dword. */
7787 if (iBitEnd & 31)
7788 {
7789 pu32 = (volatile uint32_t RT_FAR *)pvBitmap + (iBitEnd >> 5);
7790 *pu32 |= RT_H2LE_U32((UINT32_C(1) << (iBitEnd & 31)) - 1);
7791 }
7792 }
7793 }
7794}
7795
7796#endif /* IPRT_INCLUDED_asm_mem_h */
7797
7798/**
7799 * Finds the first clear bit in a bitmap.
7800 *
7801 * @returns Index of the first zero bit.
7802 * @returns -1 if no clear bit was found.
7803 * @param pvBitmap Pointer to the bitmap (little endian).
7804 * @param cBits The number of bits in the bitmap. Multiple of 32.
7805 */
7806#if RT_INLINE_ASM_EXTERNAL || (!defined(RT_ARCH_AMD64) && !defined(RT_ARCH_X86))
7807RT_DECL_ASM(int32_t) ASMBitFirstClear(const volatile void RT_FAR *pvBitmap, uint32_t cBits) RT_NOTHROW_PROTO;
7808#else
7809DECLINLINE(int32_t) ASMBitFirstClear(const volatile void RT_FAR *pvBitmap, uint32_t cBits) RT_NOTHROW_DEF
7810{
7811 if (cBits)
7812 {
7813 int32_t iBit;
7814# if RT_INLINE_ASM_GNU_STYLE
7815 RTCCUINTREG uEAX, uECX, uEDI;
7816 cBits = RT_ALIGN_32(cBits, 32);
7817 __asm__ __volatile__("repe; scasl\n\t"
7818 "je 1f\n\t"
7819# ifdef RT_ARCH_AMD64
7820 "lea -4(%%rdi), %%rdi\n\t"
7821 "xorl (%%rdi), %%eax\n\t"
7822 "subq %5, %%rdi\n\t"
7823# else
7824 "lea -4(%%edi), %%edi\n\t"
7825 "xorl (%%edi), %%eax\n\t"
7826 "subl %5, %%edi\n\t"
7827# endif
7828 "shll $3, %%edi\n\t"
7829 "bsfl %%eax, %%edx\n\t"
7830 "addl %%edi, %%edx\n\t"
7831 "1:\t\n"
7832 : "=d" (iBit)
7833 , "=&c" (uECX)
7834 , "=&D" (uEDI)
7835 , "=&a" (uEAX)
7836 : "0" (0xffffffff)
7837 , "mr" (pvBitmap)
7838 , "1" (cBits >> 5)
7839 , "2" (pvBitmap)
7840 , "3" (0xffffffff)
7841 : "cc");
7842# else
7843 cBits = RT_ALIGN_32(cBits, 32);
7844 __asm
7845 {
7846# ifdef RT_ARCH_AMD64
7847 mov rdi, [pvBitmap]
7848 mov rbx, rdi
7849# else
7850 mov edi, [pvBitmap]
7851 mov ebx, edi
7852# endif
7853 mov edx, 0ffffffffh
7854 mov eax, edx
7855 mov ecx, [cBits]
7856 shr ecx, 5
7857 repe scasd
7858 je done
7859
7860# ifdef RT_ARCH_AMD64
7861 lea rdi, [rdi - 4]
7862 xor eax, [rdi]
7863 sub rdi, rbx
7864# else
7865 lea edi, [edi - 4]
7866 xor eax, [edi]
7867 sub edi, ebx
7868# endif
7869 shl edi, 3
7870 bsf edx, eax
7871 add edx, edi
7872 done:
7873 mov [iBit], edx
7874 }
7875# endif
7876 return iBit;
7877 }
7878 return -1;
7879}
7880#endif
7881
7882
7883/**
7884 * Finds the next clear bit in a bitmap.
7885 *
7886 * @returns Index of the first zero bit.
7887 * @returns -1 if no clear bit was found.
7888 * @param pvBitmap Pointer to the bitmap (little endian).
7889 * @param cBits The number of bits in the bitmap. Multiple of 32.
7890 * @param iBitPrev The bit returned from the last search.
7891 * The search will start at iBitPrev + 1.
7892 */
7893#if RT_INLINE_ASM_EXTERNAL || (!defined(RT_ARCH_AMD64) && !defined(RT_ARCH_X86))
7894RT_DECL_ASM(int) ASMBitNextClear(const volatile void RT_FAR *pvBitmap, uint32_t cBits, uint32_t iBitPrev) RT_NOTHROW_PROTO;
7895#else
7896DECLINLINE(int) ASMBitNextClear(const volatile void RT_FAR *pvBitmap, uint32_t cBits, uint32_t iBitPrev) RT_NOTHROW_DEF
7897{
7898 const volatile uint32_t RT_FAR *pau32Bitmap = (const volatile uint32_t RT_FAR *)pvBitmap;
7899 int iBit = ++iBitPrev & 31;
7900 if (iBit)
7901 {
7902 /*
7903 * Inspect the 32-bit word containing the unaligned bit.
7904 */
7905 uint32_t u32 = ~pau32Bitmap[iBitPrev / 32] >> iBit;
7906
7907# if RT_INLINE_ASM_USES_INTRIN
7908 unsigned long ulBit = 0;
7909 if (_BitScanForward(&ulBit, u32))
7910 return ulBit + iBitPrev;
7911# else
7912# if RT_INLINE_ASM_GNU_STYLE
7913 __asm__ __volatile__("bsf %1, %0\n\t"
7914 "jnz 1f\n\t"
7915 "movl $-1, %0\n\t" /** @todo use conditional move for 64-bit? */
7916 "1:\n\t"
7917 : "=r" (iBit)
7918 : "r" (u32)
7919 : "cc");
7920# else
7921 __asm
7922 {
7923 mov edx, [u32]
7924 bsf eax, edx
7925 jnz done
7926 mov eax, 0ffffffffh
7927 done:
7928 mov [iBit], eax
7929 }
7930# endif
7931 if (iBit >= 0)
7932 return iBit + (int)iBitPrev;
7933# endif
7934
7935 /*
7936 * Skip ahead and see if there is anything left to search.
7937 */
7938 iBitPrev |= 31;
7939 iBitPrev++;
7940 if (cBits <= (uint32_t)iBitPrev)
7941 return -1;
7942 }
7943
7944 /*
7945 * 32-bit aligned search, let ASMBitFirstClear do the dirty work.
7946 */
7947 iBit = ASMBitFirstClear(&pau32Bitmap[iBitPrev / 32], cBits - iBitPrev);
7948 if (iBit >= 0)
7949 iBit += iBitPrev;
7950 return iBit;
7951}
7952#endif
7953
7954
7955/**
7956 * Finds the first set bit in a bitmap.
7957 *
7958 * @returns Index of the first set bit.
7959 * @returns -1 if no clear bit was found.
7960 * @param pvBitmap Pointer to the bitmap (little endian).
7961 * @param cBits The number of bits in the bitmap. Multiple of 32.
7962 */
7963#if RT_INLINE_ASM_EXTERNAL || (!defined(RT_ARCH_AMD64) && !defined(RT_ARCH_X86))
7964RT_DECL_ASM(int32_t) ASMBitFirstSet(const volatile void RT_FAR *pvBitmap, uint32_t cBits) RT_NOTHROW_PROTO;
7965#else
7966DECLINLINE(int32_t) ASMBitFirstSet(const volatile void RT_FAR *pvBitmap, uint32_t cBits) RT_NOTHROW_DEF
7967{
7968 if (cBits)
7969 {
7970 int32_t iBit;
7971# if RT_INLINE_ASM_GNU_STYLE
7972 RTCCUINTREG uEAX, uECX, uEDI;
7973 cBits = RT_ALIGN_32(cBits, 32);
7974 __asm__ __volatile__("repe; scasl\n\t"
7975 "je 1f\n\t"
7976# ifdef RT_ARCH_AMD64
7977 "lea -4(%%rdi), %%rdi\n\t"
7978 "movl (%%rdi), %%eax\n\t"
7979 "subq %5, %%rdi\n\t"
7980# else
7981 "lea -4(%%edi), %%edi\n\t"
7982 "movl (%%edi), %%eax\n\t"
7983 "subl %5, %%edi\n\t"
7984# endif
7985 "shll $3, %%edi\n\t"
7986 "bsfl %%eax, %%edx\n\t"
7987 "addl %%edi, %%edx\n\t"
7988 "1:\t\n"
7989 : "=d" (iBit)
7990 , "=&c" (uECX)
7991 , "=&D" (uEDI)
7992 , "=&a" (uEAX)
7993 : "0" (0xffffffff)
7994 , "mr" (pvBitmap)
7995 , "1" (cBits >> 5)
7996 , "2" (pvBitmap)
7997 , "3" (0)
7998 : "cc");
7999# else
8000 cBits = RT_ALIGN_32(cBits, 32);
8001 __asm
8002 {
8003# ifdef RT_ARCH_AMD64
8004 mov rdi, [pvBitmap]
8005 mov rbx, rdi
8006# else
8007 mov edi, [pvBitmap]
8008 mov ebx, edi
8009# endif
8010 mov edx, 0ffffffffh
8011 xor eax, eax
8012 mov ecx, [cBits]
8013 shr ecx, 5
8014 repe scasd
8015 je done
8016# ifdef RT_ARCH_AMD64
8017 lea rdi, [rdi - 4]
8018 mov eax, [rdi]
8019 sub rdi, rbx
8020# else
8021 lea edi, [edi - 4]
8022 mov eax, [edi]
8023 sub edi, ebx
8024# endif
8025 shl edi, 3
8026 bsf edx, eax
8027 add edx, edi
8028 done:
8029 mov [iBit], edx
8030 }
8031# endif
8032 return iBit;
8033 }
8034 return -1;
8035}
8036#endif
8037
8038
8039/**
8040 * Finds the next set bit in a bitmap.
8041 *
8042 * @returns Index of the next set bit.
8043 * @returns -1 if no set bit was found.
8044 * @param pvBitmap Pointer to the bitmap (little endian).
8045 * @param cBits The number of bits in the bitmap. Multiple of 32.
8046 * @param iBitPrev The bit returned from the last search.
8047 * The search will start at iBitPrev + 1.
8048 */
8049#if RT_INLINE_ASM_EXTERNAL || (!defined(RT_ARCH_AMD64) && !defined(RT_ARCH_X86))
8050RT_DECL_ASM(int) ASMBitNextSet(const volatile void RT_FAR *pvBitmap, uint32_t cBits, uint32_t iBitPrev) RT_NOTHROW_PROTO;
8051#else
8052DECLINLINE(int) ASMBitNextSet(const volatile void RT_FAR *pvBitmap, uint32_t cBits, uint32_t iBitPrev) RT_NOTHROW_DEF
8053{
8054 const volatile uint32_t RT_FAR *pau32Bitmap = (const volatile uint32_t RT_FAR *)pvBitmap;
8055 int iBit = ++iBitPrev & 31;
8056 if (iBit)
8057 {
8058 /*
8059 * Inspect the 32-bit word containing the unaligned bit.
8060 */
8061 uint32_t u32 = pau32Bitmap[iBitPrev / 32] >> iBit;
8062
8063# if RT_INLINE_ASM_USES_INTRIN
8064 unsigned long ulBit = 0;
8065 if (_BitScanForward(&ulBit, u32))
8066 return ulBit + iBitPrev;
8067# else
8068# if RT_INLINE_ASM_GNU_STYLE
8069 __asm__ __volatile__("bsf %1, %0\n\t"
8070 "jnz 1f\n\t" /** @todo use conditional move for 64-bit? */
8071 "movl $-1, %0\n\t"
8072 "1:\n\t"
8073 : "=r" (iBit)
8074 : "r" (u32)
8075 : "cc");
8076# else
8077 __asm
8078 {
8079 mov edx, [u32]
8080 bsf eax, edx
8081 jnz done
8082 mov eax, 0ffffffffh
8083 done:
8084 mov [iBit], eax
8085 }
8086# endif
8087 if (iBit >= 0)
8088 return iBit + (int)iBitPrev;
8089# endif
8090
8091 /*
8092 * Skip ahead and see if there is anything left to search.
8093 */
8094 iBitPrev |= 31;
8095 iBitPrev++;
8096 if (cBits <= (uint32_t)iBitPrev)
8097 return -1;
8098 }
8099
8100 /*
8101 * 32-bit aligned search, let ASMBitFirstClear do the dirty work.
8102 */
8103 iBit = ASMBitFirstSet(&pau32Bitmap[iBitPrev / 32], cBits - iBitPrev);
8104 if (iBit >= 0)
8105 iBit += iBitPrev;
8106 return iBit;
8107}
8108#endif
8109
8110/** @} */
8111
8112
8113/** @defgroup grp_inline_bits Bitmap Operations
8114 * @{
8115 */
8116
8117/**
8118 * Finds the first bit which is set in the given 32-bit integer.
8119 * Bits are numbered from 1 (least significant) to 32.
8120 *
8121 * @returns index [1..32] of the first set bit.
8122 * @returns 0 if all bits are cleared.
8123 * @param u32 Integer to search for set bits.
8124 * @remarks Similar to ffs() in BSD.
8125 */
8126#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
8127RT_ASM_DECL_PRAGMA_WATCOM_386(unsigned) ASMBitFirstSetU32(uint32_t u32) RT_NOTHROW_PROTO;
8128#else
8129DECLINLINE(unsigned) ASMBitFirstSetU32(uint32_t u32) RT_NOTHROW_DEF
8130{
8131# if RT_INLINE_ASM_USES_INTRIN
8132 unsigned long iBit;
8133 if (_BitScanForward(&iBit, u32))
8134 iBit++;
8135 else
8136 iBit = 0;
8137
8138# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
8139# if RT_INLINE_ASM_GNU_STYLE
8140 uint32_t iBit;
8141 __asm__ __volatile__("bsf %1, %0\n\t"
8142 "jnz 1f\n\t"
8143 "xorl %0, %0\n\t"
8144 "jmp 2f\n"
8145 "1:\n\t"
8146 "incl %0\n"
8147 "2:\n\t"
8148 : "=r" (iBit)
8149 : "rm" (u32)
8150 : "cc");
8151# else
8152 uint32_t iBit;
8153 _asm
8154 {
8155 bsf eax, [u32]
8156 jnz found
8157 xor eax, eax
8158 jmp done
8159 found:
8160 inc eax
8161 done:
8162 mov [iBit], eax
8163 }
8164# endif
8165
8166# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
8167 /*
8168 * Using the "count leading zeros (clz)" instruction here because there
8169 * is no dedicated instruction to get the first set bit.
8170 * Need to reverse the bits in the value with "rbit" first because
8171 * "clz" starts counting from the most significant bit.
8172 */
8173 uint32_t iBit;
8174 __asm__ __volatile__(
8175# if defined(RT_ARCH_ARM64)
8176 "rbit %w[uVal], %w[uVal]\n\t"
8177 "clz %w[iBit], %w[uVal]\n\t"
8178# else
8179 "rbit %[uVal], %[uVal]\n\t"
8180 "clz %[iBit], %[uVal]\n\t"
8181# endif
8182 : [uVal] "=r" (u32)
8183 , [iBit] "=r" (iBit)
8184 : "[uVal]" (u32));
8185 if (iBit != 32)
8186 iBit++;
8187 else
8188 iBit = 0; /* No bit set. */
8189
8190# else
8191# error "Port me"
8192# endif
8193 return iBit;
8194}
8195#endif
8196
8197
8198/**
8199 * Finds the first bit which is set in the given 32-bit integer.
8200 * Bits are numbered from 1 (least significant) to 32.
8201 *
8202 * @returns index [1..32] of the first set bit.
8203 * @returns 0 if all bits are cleared.
8204 * @param i32 Integer to search for set bits.
8205 * @remark Similar to ffs() in BSD.
8206 */
8207DECLINLINE(unsigned) ASMBitFirstSetS32(int32_t i32) RT_NOTHROW_DEF
8208{
8209 return ASMBitFirstSetU32((uint32_t)i32);
8210}
8211
8212
8213/**
8214 * Finds the first bit which is set in the given 64-bit integer.
8215 *
8216 * Bits are numbered from 1 (least significant) to 64.
8217 *
8218 * @returns index [1..64] of the first set bit.
8219 * @returns 0 if all bits are cleared.
8220 * @param u64 Integer to search for set bits.
8221 * @remarks Similar to ffs() in BSD.
8222 */
8223#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
8224RT_ASM_DECL_PRAGMA_WATCOM_386(unsigned) ASMBitFirstSetU64(uint64_t u64) RT_NOTHROW_PROTO;
8225#else
8226DECLINLINE(unsigned) ASMBitFirstSetU64(uint64_t u64) RT_NOTHROW_DEF
8227{
8228# if RT_INLINE_ASM_USES_INTRIN
8229 unsigned long iBit;
8230# if ARCH_BITS == 64
8231 if (_BitScanForward64(&iBit, u64))
8232 iBit++;
8233 else
8234 iBit = 0;
8235# else
8236 if (_BitScanForward(&iBit, (uint32_t)u64))
8237 iBit++;
8238 else if (_BitScanForward(&iBit, (uint32_t)(u64 >> 32)))
8239 iBit += 33;
8240 else
8241 iBit = 0;
8242# endif
8243
8244# elif RT_INLINE_ASM_GNU_STYLE && defined(RT_ARCH_AMD64)
8245 uint64_t iBit;
8246 __asm__ __volatile__("bsfq %1, %0\n\t"
8247 "jnz 1f\n\t"
8248 "xorl %k0, %k0\n\t"
8249 "jmp 2f\n"
8250 "1:\n\t"
8251 "incl %k0\n"
8252 "2:\n\t"
8253 : "=r" (iBit)
8254 : "rm" (u64)
8255 : "cc");
8256
8257# elif defined(RT_ARCH_ARM64)
8258 uint64_t iBit;
8259 __asm__ __volatile__("rbit %[uVal], %[uVal]\n\t"
8260 "clz %[iBit], %[uVal]\n\t"
8261 : [uVal] "=r" (u64)
8262 , [iBit] "=r" (iBit)
8263 : "[uVal]" (u64));
8264 if (iBit != 64)
8265 iBit++;
8266 else
8267 iBit = 0; /* No bit set. */
8268
8269# else
8270 unsigned iBit = ASMBitFirstSetU32((uint32_t)u64);
8271 if (!iBit)
8272 {
8273 iBit = ASMBitFirstSetU32((uint32_t)(u64 >> 32));
8274 if (iBit)
8275 iBit += 32;
8276 }
8277# endif
8278 return (unsigned)iBit;
8279}
8280#endif
8281
8282
8283/**
8284 * Finds the first bit which is set in the given 16-bit integer.
8285 *
8286 * Bits are numbered from 1 (least significant) to 16.
8287 *
8288 * @returns index [1..16] of the first set bit.
8289 * @returns 0 if all bits are cleared.
8290 * @param u16 Integer to search for set bits.
8291 * @remarks For 16-bit bs3kit code.
8292 */
8293#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
8294RT_ASM_DECL_PRAGMA_WATCOM_386(unsigned) ASMBitFirstSetU16(uint16_t u16) RT_NOTHROW_PROTO;
8295#else
8296DECLINLINE(unsigned) ASMBitFirstSetU16(uint16_t u16) RT_NOTHROW_DEF
8297{
8298 return ASMBitFirstSetU32((uint32_t)u16);
8299}
8300#endif
8301
8302
8303/**
8304 * Finds the last bit which is set in the given 32-bit integer.
8305 * Bits are numbered from 1 (least significant) to 32.
8306 *
8307 * @returns index [1..32] of the last set bit.
8308 * @returns 0 if all bits are cleared.
8309 * @param u32 Integer to search for set bits.
8310 * @remark Similar to fls() in BSD.
8311 */
8312#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
8313RT_ASM_DECL_PRAGMA_WATCOM_386(unsigned) ASMBitLastSetU32(uint32_t u32) RT_NOTHROW_PROTO;
8314#else
8315DECLINLINE(unsigned) ASMBitLastSetU32(uint32_t u32) RT_NOTHROW_DEF
8316{
8317# if RT_INLINE_ASM_USES_INTRIN
8318 unsigned long iBit;
8319 if (_BitScanReverse(&iBit, u32))
8320 iBit++;
8321 else
8322 iBit = 0;
8323
8324# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
8325# if RT_INLINE_ASM_GNU_STYLE
8326 uint32_t iBit;
8327 __asm__ __volatile__("bsrl %1, %0\n\t"
8328 "jnz 1f\n\t"
8329 "xorl %0, %0\n\t"
8330 "jmp 2f\n"
8331 "1:\n\t"
8332 "incl %0\n"
8333 "2:\n\t"
8334 : "=r" (iBit)
8335 : "rm" (u32)
8336 : "cc");
8337# else
8338 uint32_t iBit;
8339 _asm
8340 {
8341 bsr eax, [u32]
8342 jnz found
8343 xor eax, eax
8344 jmp done
8345 found:
8346 inc eax
8347 done:
8348 mov [iBit], eax
8349 }
8350# endif
8351
8352# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
8353 uint32_t iBit;
8354 __asm__ __volatile__(
8355# if defined(RT_ARCH_ARM64)
8356 "clz %w[iBit], %w[uVal]\n\t"
8357# else
8358 "clz %[iBit], %[uVal]\n\t"
8359# endif
8360 : [iBit] "=r" (iBit)
8361 : [uVal] "r" (u32));
8362 iBit = 32 - iBit;
8363
8364# else
8365# error "Port me"
8366# endif
8367 return iBit;
8368}
8369#endif
8370
8371
8372/**
8373 * Finds the last bit which is set in the given 32-bit integer.
8374 * Bits are numbered from 1 (least significant) to 32.
8375 *
8376 * @returns index [1..32] of the last set bit.
8377 * @returns 0 if all bits are cleared.
8378 * @param i32 Integer to search for set bits.
8379 * @remark Similar to fls() in BSD.
8380 */
8381DECLINLINE(unsigned) ASMBitLastSetS32(int32_t i32) RT_NOTHROW_DEF
8382{
8383 return ASMBitLastSetU32((uint32_t)i32);
8384}
8385
8386
8387/**
8388 * Finds the last bit which is set in the given 64-bit integer.
8389 *
8390 * Bits are numbered from 1 (least significant) to 64.
8391 *
8392 * @returns index [1..64] of the last set bit.
8393 * @returns 0 if all bits are cleared.
8394 * @param u64 Integer to search for set bits.
8395 * @remark Similar to fls() in BSD.
8396 */
8397#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
8398RT_ASM_DECL_PRAGMA_WATCOM_386(unsigned) ASMBitLastSetU64(uint64_t u64) RT_NOTHROW_PROTO;
8399#else
8400DECLINLINE(unsigned) ASMBitLastSetU64(uint64_t u64) RT_NOTHROW_DEF
8401{
8402# if RT_INLINE_ASM_USES_INTRIN
8403 unsigned long iBit;
8404# if ARCH_BITS == 64
8405 if (_BitScanReverse64(&iBit, u64))
8406 iBit++;
8407 else
8408 iBit = 0;
8409# else
8410 if (_BitScanReverse(&iBit, (uint32_t)(u64 >> 32)))
8411 iBit += 33;
8412 else if (_BitScanReverse(&iBit, (uint32_t)u64))
8413 iBit++;
8414 else
8415 iBit = 0;
8416# endif
8417
8418# elif RT_INLINE_ASM_GNU_STYLE && defined(RT_ARCH_AMD64)
8419 uint64_t iBit;
8420 __asm__ __volatile__("bsrq %1, %0\n\t"
8421 "jnz 1f\n\t"
8422 "xorl %k0, %k0\n\t"
8423 "jmp 2f\n"
8424 "1:\n\t"
8425 "incl %k0\n"
8426 "2:\n\t"
8427 : "=r" (iBit)
8428 : "rm" (u64)
8429 : "cc");
8430
8431# elif defined(RT_ARCH_ARM64)
8432 uint64_t iBit;
8433 __asm__ __volatile__("clz %[iBit], %[uVal]\n\t"
8434 : [iBit] "=r" (iBit)
8435 : [uVal] "r" (u64));
8436 iBit = 64 - iBit;
8437
8438# else
8439 unsigned iBit = ASMBitLastSetU32((uint32_t)(u64 >> 32));
8440 if (iBit)
8441 iBit += 32;
8442 else
8443 iBit = ASMBitLastSetU32((uint32_t)u64);
8444# endif
8445 return (unsigned)iBit;
8446}
8447#endif
8448
8449
8450/**
8451 * Finds the last bit which is set in the given 16-bit integer.
8452 *
8453 * Bits are numbered from 1 (least significant) to 16.
8454 *
8455 * @returns index [1..16] of the last set bit.
8456 * @returns 0 if all bits are cleared.
8457 * @param u16 Integer to search for set bits.
8458 * @remarks For 16-bit bs3kit code.
8459 */
8460#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
8461RT_ASM_DECL_PRAGMA_WATCOM_386(unsigned) ASMBitLastSetU16(uint16_t u16) RT_NOTHROW_PROTO;
8462#else
8463DECLINLINE(unsigned) ASMBitLastSetU16(uint16_t u16) RT_NOTHROW_DEF
8464{
8465 return ASMBitLastSetU32((uint32_t)u16);
8466}
8467#endif
8468
8469
8470/**
8471 * Count the number of leading zero bits in the given 32-bit integer.
8472 *
8473 * The counting starts with the most significate bit.
8474 *
8475 * @returns Number of most significant zero bits.
8476 * @returns 32 if all bits are cleared.
8477 * @param u32 Integer to consider.
8478 * @remarks Similar to __builtin_clz() in gcc, except defined zero input result.
8479 */
8480#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
8481RT_ASM_DECL_PRAGMA_WATCOM_386(unsigned) ASMCountLeadingZerosU32(uint32_t u32) RT_NOTHROW_PROTO;
8482#else
8483DECLINLINE(unsigned) ASMCountLeadingZerosU32(uint32_t u32) RT_NOTHROW_DEF
8484{
8485# if RT_INLINE_ASM_USES_INTRIN
8486 unsigned long iBit;
8487 if (!_BitScanReverse(&iBit, u32))
8488 return 32;
8489 return 31 - (unsigned)iBit;
8490
8491# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
8492 uint32_t iBit;
8493# if RT_INLINE_ASM_GNU_STYLE && defined(RT_ARCH_AMD64) && 0 /* significantly slower on 10980xe; 929 vs 237 ps/call */
8494 __asm__ __volatile__("bsrl %1, %0\n\t"
8495 "cmovzl %2, %0\n\t"
8496 : "=&r" (iBit)
8497 : "rm" (u32)
8498 , "rm" ((int32_t)-1)
8499 : "cc");
8500# elif RT_INLINE_ASM_GNU_STYLE
8501 __asm__ __volatile__("bsr %1, %0\n\t"
8502 "jnz 1f\n\t"
8503 "mov $-1, %0\n\t"
8504 "1:\n\t"
8505 : "=r" (iBit)
8506 : "rm" (u32)
8507 : "cc");
8508# else
8509 _asm
8510 {
8511 bsr eax, [u32]
8512 jnz found
8513 mov eax, -1
8514 found:
8515 mov [iBit], eax
8516 }
8517# endif
8518 return 31 - iBit;
8519
8520# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
8521 uint32_t iBit;
8522 __asm__ __volatile__(
8523# if defined(RT_ARCH_ARM64)
8524 "clz %w[iBit], %w[uVal]\n\t"
8525# else
8526 "clz %[iBit], %[uVal]\n\t"
8527# endif
8528 : [uVal] "=r" (u32)
8529 , [iBit] "=r" (iBit)
8530 : "[uVal]" (u32));
8531 return iBit;
8532
8533# elif defined(__GNUC__)
8534 AssertCompile(sizeof(u32) == sizeof(unsigned int));
8535 return u32 ? __builtin_clz(u32) : 32;
8536
8537# else
8538# error "Port me"
8539# endif
8540}
8541#endif
8542
8543
8544/**
8545 * Count the number of leading zero bits in the given 64-bit integer.
8546 *
8547 * The counting starts with the most significate bit.
8548 *
8549 * @returns Number of most significant zero bits.
8550 * @returns 64 if all bits are cleared.
8551 * @param u64 Integer to consider.
8552 * @remarks Similar to __builtin_clzl() in gcc, except defined zero input
8553 * result.
8554 */
8555#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
8556RT_ASM_DECL_PRAGMA_WATCOM_386(unsigned) ASMCountLeadingZerosU64(uint64_t u64) RT_NOTHROW_PROTO;
8557#else
8558DECLINLINE(unsigned) ASMCountLeadingZerosU64(uint64_t u64) RT_NOTHROW_DEF
8559{
8560# if RT_INLINE_ASM_USES_INTRIN
8561 unsigned long iBit;
8562# if ARCH_BITS == 64
8563 if (_BitScanReverse64(&iBit, u64))
8564 return 63 - (unsigned)iBit;
8565# else
8566 if (_BitScanReverse(&iBit, (uint32_t)(u64 >> 32)))
8567 return 31 - (unsigned)iBit;
8568 if (_BitScanReverse(&iBit, (uint32_t)u64))
8569 return 63 - (unsigned)iBit;
8570# endif
8571 return 64;
8572
8573# elif RT_INLINE_ASM_GNU_STYLE && defined(RT_ARCH_AMD64)
8574 uint64_t iBit;
8575# if 0 /* 10980xe benchmark: 932 ps/call - the slower variant */
8576 __asm__ __volatile__("bsrq %1, %0\n\t"
8577 "cmovzq %2, %0\n\t"
8578 : "=&r" (iBit)
8579 : "rm" (u64)
8580 , "rm" ((int64_t)-1)
8581 : "cc");
8582# else /* 10980xe benchmark: 262 ps/call */
8583 __asm__ __volatile__("bsrq %1, %0\n\t"
8584 "jnz 1f\n\t"
8585 "mov $-1, %0\n\t"
8586 "1:\n\t"
8587 : "=&r" (iBit)
8588 : "rm" (u64)
8589 : "cc");
8590# endif
8591 return 63 - (unsigned)iBit;
8592
8593# elif defined(RT_ARCH_ARM64)
8594 uint64_t iBit;
8595 __asm__ __volatile__("clz %[iBit], %[uVal]\n\t"
8596 : [uVal] "=r" (u64)
8597 , [iBit] "=r" (iBit)
8598 : "[uVal]" (u64));
8599 return (unsigned)iBit;
8600
8601# elif defined(__GNUC__) && ARCH_BITS == 64
8602 AssertCompile(sizeof(u64) == sizeof(unsigned long));
8603 return u64 ? __builtin_clzl(u64) : 64;
8604
8605# else
8606 unsigned iBit = ASMCountLeadingZerosU32((uint32_t)(u64 >> 32));
8607 if (iBit == 32)
8608 iBit = ASMCountLeadingZerosU32((uint32_t)u64) + 32;
8609 return iBit;
8610# endif
8611}
8612#endif
8613
8614
8615/**
8616 * Count the number of leading zero bits in the given 16-bit integer.
8617 *
8618 * The counting starts with the most significate bit.
8619 *
8620 * @returns Number of most significant zero bits.
8621 * @returns 16 if all bits are cleared.
8622 * @param u16 Integer to consider.
8623 */
8624#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
8625RT_ASM_DECL_PRAGMA_WATCOM_386(unsigned) ASMCountLeadingZerosU16(uint16_t u16) RT_NOTHROW_PROTO;
8626#else
8627DECLINLINE(unsigned) ASMCountLeadingZerosU16(uint16_t u16) RT_NOTHROW_DEF
8628{
8629# if RT_INLINE_ASM_GNU_STYLE && (defined(RT_ARCH_X86) || defined(RT_ARCH_AMD64)) && 0 /* slower (10980xe: 987 vs 292 ps/call) */
8630 uint16_t iBit;
8631 __asm__ __volatile__("bsrw %1, %0\n\t"
8632 "jnz 1f\n\t"
8633 "mov $-1, %0\n\t"
8634 "1:\n\t"
8635 : "=r" (iBit)
8636 : "rm" (u16)
8637 : "cc");
8638 return 15 - (int16_t)iBit;
8639# else
8640 return ASMCountLeadingZerosU32((uint32_t)u16) - 16;
8641# endif
8642}
8643#endif
8644
8645
8646/**
8647 * Count the number of trailing zero bits in the given 32-bit integer.
8648 *
8649 * The counting starts with the least significate bit, i.e. the zero bit.
8650 *
8651 * @returns Number of least significant zero bits.
8652 * @returns 32 if all bits are cleared.
8653 * @param u32 Integer to consider.
8654 * @remarks Similar to __builtin_ctz() in gcc, except defined zero input result.
8655 */
8656#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
8657RT_ASM_DECL_PRAGMA_WATCOM_386(unsigned) ASMCountTrailingZerosU32(uint32_t u32) RT_NOTHROW_PROTO;
8658#else
8659DECLINLINE(unsigned) ASMCountTrailingZerosU32(uint32_t u32) RT_NOTHROW_DEF
8660{
8661# if RT_INLINE_ASM_USES_INTRIN
8662 unsigned long iBit;
8663 if (!_BitScanForward(&iBit, u32))
8664 return 32;
8665 return (unsigned)iBit;
8666
8667# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
8668 uint32_t iBit;
8669# if RT_INLINE_ASM_GNU_STYLE && defined(RT_ARCH_AMD64) && 0 /* significantly slower on 10980xe; 932 vs 240 ps/call */
8670 __asm__ __volatile__("bsfl %1, %0\n\t"
8671 "cmovzl %2, %0\n\t"
8672 : "=&r" (iBit)
8673 : "rm" (u32)
8674 , "rm" ((int32_t)32)
8675 : "cc");
8676# elif RT_INLINE_ASM_GNU_STYLE
8677 __asm__ __volatile__("bsfl %1, %0\n\t"
8678 "jnz 1f\n\t"
8679 "mov $32, %0\n\t"
8680 "1:\n\t"
8681 : "=r" (iBit)
8682 : "rm" (u32)
8683 : "cc");
8684# else
8685 _asm
8686 {
8687 bsf eax, [u32]
8688 jnz found
8689 mov eax, 32
8690 found:
8691 mov [iBit], eax
8692 }
8693# endif
8694 return iBit;
8695
8696# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
8697 /* Invert the bits and use clz. */
8698 uint32_t iBit;
8699 __asm__ __volatile__(
8700# if defined(RT_ARCH_ARM64)
8701 "rbit %w[uVal], %w[uVal]\n\t"
8702 "clz %w[iBit], %w[uVal]\n\t"
8703# else
8704 "rbit %[uVal], %[uVal]\n\t"
8705 "clz %[iBit], %[uVal]\n\t"
8706# endif
8707 : [uVal] "=r" (u32)
8708 , [iBit] "=r" (iBit)
8709 : "[uVal]" (u32));
8710 return iBit;
8711
8712# elif defined(__GNUC__)
8713 AssertCompile(sizeof(u32) == sizeof(unsigned int));
8714 return u32 ? __builtin_ctz(u32) : 32;
8715
8716# else
8717# error "Port me"
8718# endif
8719}
8720#endif
8721
8722
8723/**
8724 * Count the number of trailing zero bits in the given 64-bit integer.
8725 *
8726 * The counting starts with the least significate bit.
8727 *
8728 * @returns Number of least significant zero bits.
8729 * @returns 64 if all bits are cleared.
8730 * @param u64 Integer to consider.
8731 * @remarks Similar to __builtin_ctzl() in gcc, except defined zero input
8732 * result.
8733 */
8734#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
8735RT_ASM_DECL_PRAGMA_WATCOM_386(unsigned) ASMCountTrailingZerosU64(uint64_t u64) RT_NOTHROW_PROTO;
8736#else
8737DECLINLINE(unsigned) ASMCountTrailingZerosU64(uint64_t u64) RT_NOTHROW_DEF
8738{
8739# if RT_INLINE_ASM_USES_INTRIN
8740 unsigned long iBit;
8741# if ARCH_BITS == 64
8742 if (_BitScanForward64(&iBit, u64))
8743 return (unsigned)iBit;
8744# else
8745 if (_BitScanForward(&iBit, (uint32_t)u64))
8746 return (unsigned)iBit;
8747 if (_BitScanForward(&iBit, (uint32_t)(u64 >> 32)))
8748 return (unsigned)iBit + 32;
8749# endif
8750 return 64;
8751
8752# elif RT_INLINE_ASM_GNU_STYLE && defined(RT_ARCH_AMD64)
8753 uint64_t iBit;
8754# if 0 /* 10980xe benchmark: 932 ps/call - the slower variant */
8755 __asm__ __volatile__("bsfq %1, %0\n\t"
8756 "cmovzq %2, %0\n\t"
8757 : "=&r" (iBit)
8758 : "rm" (u64)
8759 , "rm" ((int64_t)64)
8760 : "cc");
8761# else /* 10980xe benchmark: 262 ps/call */
8762 __asm__ __volatile__("bsfq %1, %0\n\t"
8763 "jnz 1f\n\t"
8764 "mov $64, %0\n\t"
8765 "1:\n\t"
8766 : "=&r" (iBit)
8767 : "rm" (u64)
8768 : "cc");
8769# endif
8770 return (unsigned)iBit;
8771
8772# elif defined(RT_ARCH_ARM64)
8773 /* Invert the bits and use clz. */
8774 uint64_t iBit;
8775 __asm__ __volatile__("rbit %[uVal], %[uVal]\n\t"
8776 "clz %[iBit], %[uVal]\n\t"
8777 : [uVal] "=r" (u64)
8778 , [iBit] "=r" (iBit)
8779 : "[uVal]" (u64));
8780 return (unsigned)iBit;
8781
8782# elif defined(__GNUC__) && ARCH_BITS == 64
8783 AssertCompile(sizeof(u64) == sizeof(unsigned long));
8784 return u64 ? __builtin_ctzl(u64) : 64;
8785
8786# else
8787 unsigned iBit = ASMCountTrailingZerosU32((uint32_t)u64);
8788 if (iBit == 32)
8789 iBit = ASMCountTrailingZerosU32((uint32_t)(u64 >> 32)) + 32;
8790 return iBit;
8791# endif
8792}
8793#endif
8794
8795
8796/**
8797 * Count the number of trailing zero bits in the given 16-bit integer.
8798 *
8799 * The counting starts with the most significate bit.
8800 *
8801 * @returns Number of most significant zero bits.
8802 * @returns 16 if all bits are cleared.
8803 * @param u16 Integer to consider.
8804 */
8805#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
8806RT_ASM_DECL_PRAGMA_WATCOM_386(unsigned) ASMCountTrailingZerosU16(uint16_t u16) RT_NOTHROW_PROTO;
8807#else
8808DECLINLINE(unsigned) ASMCountTrailingZerosU16(uint16_t u16) RT_NOTHROW_DEF
8809{
8810# if RT_INLINE_ASM_GNU_STYLE && (defined(RT_ARCH_X86) || defined(RT_ARCH_AMD64)) && 0 /* slower (10980xe: 992 vs 349 ps/call) */
8811 uint16_t iBit;
8812 __asm__ __volatile__("bsfw %1, %0\n\t"
8813 "jnz 1f\n\t"
8814 "mov $16, %0\n\t"
8815 "1:\n\t"
8816 : "=r" (iBit)
8817 : "rm" (u16)
8818 : "cc");
8819 return iBit;
8820# else
8821 return ASMCountTrailingZerosU32((uint32_t)u16 | UINT32_C(0x10000));
8822#endif
8823}
8824#endif
8825
8826
8827/**
8828 * Rotate 32-bit unsigned value to the left by @a cShift.
8829 *
8830 * @returns Rotated value.
8831 * @param u32 The value to rotate.
8832 * @param cShift How many bits to rotate by.
8833 */
8834#ifdef __WATCOMC__
8835RT_ASM_DECL_PRAGMA_WATCOM(uint32_t) ASMRotateLeftU32(uint32_t u32, unsigned cShift) RT_NOTHROW_PROTO;
8836#else
8837DECLINLINE(uint32_t) ASMRotateLeftU32(uint32_t u32, uint32_t cShift) RT_NOTHROW_DEF
8838{
8839# if RT_INLINE_ASM_USES_INTRIN
8840 return _rotl(u32, cShift);
8841
8842# elif RT_INLINE_ASM_GNU_STYLE && (defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86))
8843 __asm__ __volatile__("roll %b1, %0" : "=g" (u32) : "Ic" (cShift), "0" (u32) : "cc");
8844 return u32;
8845
8846# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
8847 __asm__ __volatile__(
8848# if defined(RT_ARCH_ARM64)
8849 "ror %w[uRet], %w[uVal], %w[cShift]\n\t"
8850# else
8851 "ror %[uRet], %[uVal], %[cShift]\n\t"
8852# endif
8853 : [uRet] "=r" (u32)
8854 : [uVal] "[uRet]" (u32)
8855 , [cShift] "r" (32 - (cShift & 31))); /** @todo there is an immediate form here */
8856 return u32;
8857
8858# else
8859 cShift &= 31;
8860 return (u32 << cShift) | (u32 >> (32 - cShift));
8861# endif
8862}
8863#endif
8864
8865
8866/**
8867 * Rotate 32-bit unsigned value to the right by @a cShift.
8868 *
8869 * @returns Rotated value.
8870 * @param u32 The value to rotate.
8871 * @param cShift How many bits to rotate by.
8872 */
8873#ifdef __WATCOMC__
8874RT_ASM_DECL_PRAGMA_WATCOM(uint32_t) ASMRotateRightU32(uint32_t u32, unsigned cShift) RT_NOTHROW_PROTO;
8875#else
8876DECLINLINE(uint32_t) ASMRotateRightU32(uint32_t u32, uint32_t cShift) RT_NOTHROW_DEF
8877{
8878# if RT_INLINE_ASM_USES_INTRIN
8879 return _rotr(u32, cShift);
8880
8881# elif RT_INLINE_ASM_GNU_STYLE && (defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86))
8882 __asm__ __volatile__("rorl %b1, %0" : "=g" (u32) : "Ic" (cShift), "0" (u32) : "cc");
8883 return u32;
8884
8885# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
8886 __asm__ __volatile__(
8887# if defined(RT_ARCH_ARM64)
8888 "ror %w[uRet], %w[uVal], %w[cShift]\n\t"
8889# else
8890 "ror %[uRet], %[uVal], %[cShift]\n\t"
8891# endif
8892 : [uRet] "=r" (u32)
8893 : [uVal] "[uRet]" (u32)
8894 , [cShift] "r" (cShift & 31)); /** @todo there is an immediate form here */
8895 return u32;
8896
8897# else
8898 cShift &= 31;
8899 return (u32 >> cShift) | (u32 << (32 - cShift));
8900# endif
8901}
8902#endif
8903
8904
8905/**
8906 * Rotate 64-bit unsigned value to the left by @a cShift.
8907 *
8908 * @returns Rotated value.
8909 * @param u64 The value to rotate.
8910 * @param cShift How many bits to rotate by.
8911 */
8912DECLINLINE(uint64_t) ASMRotateLeftU64(uint64_t u64, uint32_t cShift) RT_NOTHROW_DEF
8913{
8914#if RT_INLINE_ASM_USES_INTRIN
8915 return _rotl64(u64, cShift);
8916
8917#elif RT_INLINE_ASM_GNU_STYLE && defined(RT_ARCH_AMD64)
8918 __asm__ __volatile__("rolq %b1, %0" : "=g" (u64) : "Jc" (cShift), "0" (u64) : "cc");
8919 return u64;
8920
8921#elif RT_INLINE_ASM_GNU_STYLE && defined(RT_ARCH_X86)
8922 uint32_t uSpill;
8923 __asm__ __volatile__("testb $0x20, %%cl\n\t" /* if (cShift >= 0x20) { swap(u64.hi, u64lo); cShift -= 0x20; } */
8924 "jz 1f\n\t"
8925 "xchgl %%eax, %%edx\n\t"
8926 "1:\n\t"
8927 "andb $0x1f, %%cl\n\t" /* if (cShift & 0x1f) { */
8928 "jz 2f\n\t"
8929 "movl %%edx, %2\n\t" /* save the hi value in %3. */
8930 "shldl %%cl,%%eax,%%edx\n\t" /* shift the hi value left, feeding MSBits from the low value. */
8931 "shldl %%cl,%2,%%eax\n\t" /* shift the lo value left, feeding MSBits from the saved hi value. */
8932 "2:\n\t" /* } */
8933 : "=A" (u64)
8934 , "=c" (cShift)
8935 , "=r" (uSpill)
8936 : "0" (u64)
8937 , "1" (cShift)
8938 : "cc");
8939 return u64;
8940
8941# elif defined(RT_ARCH_ARM64)
8942 __asm__ __volatile__("ror %[uRet], %[uVal], %[cShift]\n\t"
8943 : [uRet] "=r" (u64)
8944 : [uVal] "[uRet]" (u64)
8945 , [cShift] "r" ((uint64_t)(64 - (cShift & 63)))); /** @todo there is an immediate form here */
8946 return u64;
8947
8948#else
8949 cShift &= 63;
8950 return (u64 << cShift) | (u64 >> (64 - cShift));
8951#endif
8952}
8953
8954
8955/**
8956 * Rotate 64-bit unsigned value to the right by @a cShift.
8957 *
8958 * @returns Rotated value.
8959 * @param u64 The value to rotate.
8960 * @param cShift How many bits to rotate by.
8961 */
8962DECLINLINE(uint64_t) ASMRotateRightU64(uint64_t u64, uint32_t cShift) RT_NOTHROW_DEF
8963{
8964#if RT_INLINE_ASM_USES_INTRIN
8965 return _rotr64(u64, cShift);
8966
8967#elif RT_INLINE_ASM_GNU_STYLE && defined(RT_ARCH_AMD64)
8968 __asm__ __volatile__("rorq %b1, %0" : "=g" (u64) : "Jc" (cShift), "0" (u64) : "cc");
8969 return u64;
8970
8971#elif RT_INLINE_ASM_GNU_STYLE && defined(RT_ARCH_X86)
8972 uint32_t uSpill;
8973 __asm__ __volatile__("testb $0x20, %%cl\n\t" /* if (cShift >= 0x20) { swap(u64.hi, u64lo); cShift -= 0x20; } */
8974 "jz 1f\n\t"
8975 "xchgl %%eax, %%edx\n\t"
8976 "1:\n\t"
8977 "andb $0x1f, %%cl\n\t" /* if (cShift & 0x1f) { */
8978 "jz 2f\n\t"
8979 "movl %%edx, %2\n\t" /* save the hi value in %3. */
8980 "shrdl %%cl,%%eax,%%edx\n\t" /* shift the hi value right, feeding LSBits from the low value. */
8981 "shrdl %%cl,%2,%%eax\n\t" /* shift the lo value right, feeding LSBits from the saved hi value. */
8982 "2:\n\t" /* } */
8983 : "=A" (u64)
8984 , "=c" (cShift)
8985 , "=r" (uSpill)
8986 : "0" (u64)
8987 , "1" (cShift)
8988 : "cc");
8989 return u64;
8990
8991# elif defined(RT_ARCH_ARM64)
8992 __asm__ __volatile__("ror %[uRet], %[uVal], %[cShift]\n\t"
8993 : [uRet] "=r" (u64)
8994 : [uVal] "[uRet]" (u64)
8995 , [cShift] "r" ((uint64_t)(cShift & 63))); /** @todo there is an immediate form here */
8996 return u64;
8997
8998#else
8999 cShift &= 63;
9000 return (u64 >> cShift) | (u64 << (64 - cShift));
9001#endif
9002}
9003
9004/** @} */
9005
9006
9007/** @} */
9008
9009/*
9010 * Include #pragma aux definitions for Watcom C/C++.
9011 */
9012#if defined(__WATCOMC__) && ARCH_BITS == 16 && defined(RT_ARCH_X86)
9013# define IPRT_ASM_WATCOM_X86_16_WITH_PRAGMAS
9014# undef IPRT_INCLUDED_asm_watcom_x86_16_h
9015# include "asm-watcom-x86-16.h"
9016#elif defined(__WATCOMC__) && ARCH_BITS == 32 && defined(RT_ARCH_X86)
9017# define IPRT_ASM_WATCOM_X86_32_WITH_PRAGMAS
9018# undef IPRT_INCLUDED_asm_watcom_x86_32_h
9019# include "asm-watcom-x86-32.h"
9020#endif
9021
9022#endif /* !IPRT_INCLUDED_asm_h */
9023
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette