VirtualBox

source: vbox/trunk/include/iprt/asm.h@ 106920

Last change on this file since 106920 was 106607, checked in by vboxsync, 3 months ago

iprt/asm.h: Adjustments of the win/arm64 changes - RT_INLINE_ASM_USES_INTRIN is always defined and we must test the value it has. [build fix] bugref:10392

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 297.9 KB
Line 
1/** @file
2 * IPRT - Assembly Functions.
3 */
4
5/*
6 * Copyright (C) 2006-2024 Oracle and/or its affiliates.
7 *
8 * This file is part of VirtualBox base platform packages, as
9 * available from https://www.virtualbox.org.
10 *
11 * This program is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU General Public License
13 * as published by the Free Software Foundation, in version 3 of the
14 * License.
15 *
16 * This program is distributed in the hope that it will be useful, but
17 * WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, see <https://www.gnu.org/licenses>.
23 *
24 * The contents of this file may alternatively be used under the terms
25 * of the Common Development and Distribution License Version 1.0
26 * (CDDL), a copy of it is provided in the "COPYING.CDDL" file included
27 * in the VirtualBox distribution, in which case the provisions of the
28 * CDDL are applicable instead of those of the GPL.
29 *
30 * You may elect to license modified versions of this file under the
31 * terms and conditions of either the GPL or the CDDL or both.
32 *
33 * SPDX-License-Identifier: GPL-3.0-only OR CDDL-1.0
34 */
35
36#ifndef IPRT_INCLUDED_asm_h
37#define IPRT_INCLUDED_asm_h
38#ifndef RT_WITHOUT_PRAGMA_ONCE
39# pragma once
40#endif
41
42#include <iprt/cdefs.h>
43#include <iprt/types.h>
44#include <iprt/assert.h>
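/** @def RT_INLINE_ASM_USES_INTRIN
 * Defined as 1 if we're using the compiler intrinsics of a _MSC_VER 1400
 * compiler, otherwise defined as 0.
 *
 * The macro is always defined, so test its value and not merely whether it
 * exists.
 */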
49
50/* Solaris 10 header ugliness */
51#ifdef u
52# undef u
53#endif
54
55#if defined(_MSC_VER) && RT_INLINE_ASM_USES_INTRIN
56/* Emit the intrinsics at all optimization levels. */
57# include <iprt/sanitized/intrin.h>
58# pragma intrinsic(_ReadWriteBarrier)
59# if defined(RT_ARCH_X86) || defined(RT_ARCH_AMD64)
60# pragma intrinsic(__cpuid)
61# pragma intrinsic(__stosd)
62# pragma intrinsic(__stosw)
63# pragma intrinsic(__stosb)
64# ifdef RT_ARCH_AMD64
65# pragma intrinsic(__stosq)
66# pragma intrinsic(_byteswap_uint64)
67# pragma intrinsic(_InterlockedCompareExchange128)
68# pragma intrinsic(_InterlockedExchange64)
69# pragma intrinsic(_InterlockedExchangeAdd64)
70# pragma intrinsic(_InterlockedAnd64)
71# pragma intrinsic(_InterlockedOr64)
72# pragma intrinsic(_InterlockedIncrement64)
73# pragma intrinsic(_InterlockedDecrement64)
74# endif
75# elif defined(RT_ARCH_ARM64)
76# pragma intrinsic(__break)
77# pragma intrinsic(__dmb)
78# pragma intrinsic(__dsb)
79# pragma intrinsic(__isb)
80# pragma intrinsic(__nop)
81# pragma intrinsic(__yield)
82# pragma intrinsic(__swp8)
83# pragma intrinsic(__swpa8)
84# pragma intrinsic(__swpal8)
85# pragma intrinsic(__swp16)
86# pragma intrinsic(__swpa16)
87# pragma intrinsic(__swpal16)
88# pragma intrinsic(__swp32)
89# pragma intrinsic(__swpa32)
90# pragma intrinsic(__swpal32)
91# pragma intrinsic(__swp64)
92# pragma intrinsic(__swpa64)
93# pragma intrinsic(__swpal64)
94# pragma intrinsic(__cas8)
95# pragma intrinsic(__casl8)
96# pragma intrinsic(__cas16)
97# pragma intrinsic(__casl16)
98# pragma intrinsic(__cas32)
99# pragma intrinsic(__casl32)
100# pragma intrinsic(__cas64)
101# pragma intrinsic(__casl64)
102# pragma intrinsic(__casa8)
103# pragma intrinsic(__casal8)
104# pragma intrinsic(__casa16)
105# pragma intrinsic(__casa64)
106# pragma intrinsic(__iso_volatile_load8)
107# pragma intrinsic(__iso_volatile_load16)
108# pragma intrinsic(__iso_volatile_load32)
109# pragma intrinsic(__iso_volatile_load64)
110# pragma intrinsic(__iso_volatile_store8)
111# pragma intrinsic(__iso_volatile_store16)
112# pragma intrinsic(__iso_volatile_store32)
113# pragma intrinsic(__iso_volatile_store64)
114# pragma intrinsic(__load_acquire8)
115# pragma intrinsic(__load_acquire16)
116# pragma intrinsic(__load_acquire32)
117# pragma intrinsic(__load_acquire64)
118# pragma intrinsic(__stlr8)
119# pragma intrinsic(__stlr16)
120# pragma intrinsic(__stlr32)
121# pragma intrinsic(__stlr64)
122# else
123# error "Port me"
124# endif
125# pragma intrinsic(_BitScanForward)
126# pragma intrinsic(_BitScanReverse)
127# pragma intrinsic(_bittest)
128# pragma intrinsic(_bittestandset)
129# pragma intrinsic(_bittestandreset)
130# pragma intrinsic(_bittestandcomplement)
131# pragma intrinsic(_byteswap_ushort)
132# pragma intrinsic(_byteswap_ulong)
133# pragma intrinsic(_interlockedbittestandset)
134# pragma intrinsic(_interlockedbittestandreset)
135# pragma intrinsic(_InterlockedAnd)
136# pragma intrinsic(_InterlockedOr)
137# pragma intrinsic(_InterlockedXor)
138# pragma intrinsic(_InterlockedIncrement)
139# pragma intrinsic(_InterlockedDecrement)
140# pragma intrinsic(_InterlockedExchange)
141# pragma intrinsic(_InterlockedExchangeAdd)
142# pragma intrinsic(_InterlockedCompareExchange)
143# pragma intrinsic(_InterlockedCompareExchange8)
144# pragma intrinsic(_InterlockedCompareExchange16)
145# pragma intrinsic(_InterlockedCompareExchange64)
146# pragma intrinsic(_rotl)
147# pragma intrinsic(_rotr)
148# pragma intrinsic(_rotl64)
149# pragma intrinsic(_rotr64)
150#endif
151
#if (defined(RT_ARCH_ARM64) && (defined(RT_OS_DARWIN) || defined(RT_OS_WINDOWS))) || defined(DOXYGEN_RUNNING)
/** @def RTASM_ARM64_USE_FEAT_LSE
 * Implement the atomic operations with instructions from the FEAT_LSE set,
 * on the assumption that the host CPU always supports them.  Only enabled for
 * ARM64 builds targeting Darwin or Windows (and when doxygen is running). */
# define RTASM_ARM64_USE_FEAT_LSE               1
/** @def RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB
 * Leave out the DMB barrier in most places and rely on the acquire-release
 * aspects of the instructions to do the serializing.  The assumption is that
 * the tstRTInline benchmark may be skewing the results by testing an unusual
 * scenario. */
# define RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB   1
#endif
163
164
165/*
166 * Undefine all symbols we have Watcom C/C++ #pragma aux'es for.
167 */
168#if defined(__WATCOMC__) && ARCH_BITS == 16 && defined(RT_ARCH_X86)
169# include "asm-watcom-x86-16.h"
170#elif defined(__WATCOMC__) && ARCH_BITS == 32 && defined(RT_ARCH_X86)
171# include "asm-watcom-x86-32.h"
172#endif
173
174
/** @defgroup grp_rt_asm    ASM - Assembly Routines
 * @ingroup grp_rt
 *
 * @remarks The difference between ordered and unordered atomic operations is
 *          that the former will complete outstanding reads and writes before
 *          continuing, while the latter don't make any promises about the
 *          order.  Ordered operations don't, it seems, make any 100% promise
 *          with regard to whether the operation will complete before any
 *          subsequent memory access.  (Please correct this if it is wrong.)
 *
 *          ASMAtomicSomething operations are all ordered, while
 *          ASMAtomicUoSomething are unordered (note the Uo).
 *
 *          Please note that ordered operations do not necessarily imply a
 *          compiler (memory) barrier.  The user has to use the
 *          ASMCompilerBarrier() macro when that is deemed necessary.
 *
 * @remarks Some remarks about __volatile__: Without this keyword gcc is allowed
 *          to reorder or even optimize assembler instructions away.  For
 *          instance, in the following code the second rdmsr instruction is
 *          optimized away because gcc treats that instruction as deterministic:
 *
 *            @code
 *            static inline uint64_t rdmsr_low(int idx)
 *            {
 *                uint32_t low;
 *                __asm__ ("rdmsr" : "=a"(low) : "c"(idx) : "edx");
 *                return low;
 *            }
 *            ...
 *            uint32_t msr1 = rdmsr_low(1);
 *            foo(msr1);
 *            msr1 = rdmsr_low(1);
 *            bar(msr1);
 *            @endcode
 *
 *          The input parameter of rdmsr_low is the same for both calls, and
 *          therefore gcc will use the result of the first call as the input
 *          parameter for bar() as well.  For rdmsr this is not acceptable as
 *          this instruction is _not_ deterministic.  This applies to reading
 *          machine status information in general.
 *
 * @{
 */
218
219
220/** @def RT_INLINE_ASM_GCC_4_3_X_X86
221 * Used to work around some 4.3.x register allocation issues in this version of
222 * the compiler. So far this workaround is still required for 4.4 and 4.5 but
223 * definitely not for 5.x */
224#if (RT_GNUC_PREREQ(4, 3) && !RT_GNUC_PREREQ(5, 0) && defined(__i386__))
225# define RT_INLINE_ASM_GCC_4_3_X_X86 1
226#else
227# define RT_INLINE_ASM_GCC_4_3_X_X86 0
228#endif
229
/** @def RT_INLINE_DONT_MIX_CMPXCHG8B_AND_PIC
 * Set to 1 when inline cmpxchg8b code must not be mixed with PIC mode on x86,
 * otherwise 0.  May be predefined by the includer, in which case it is left
 * untouched.
 *
 * Background: i686-apple-darwin9-gcc-4.0.1 (GCC) 4.0.1 (Apple Inc. build 5493)
 * screws up RTSemRWRequestWrite semsemrw-lockless-generic.cpp in release
 * builds (PIC mode, x86).  Some gcc 4.3.x versions may also have register
 * allocation issues with cmpxchg8b when in PIC mode on x86.
 */
#ifndef RT_INLINE_DONT_MIX_CMPXCHG8B_AND_PIC
# if defined(DOXYGEN_RUNNING) || defined(__WATCOMC__) /* Watcom has trouble with the expression below */
#  define RT_INLINE_DONT_MIX_CMPXCHG8B_AND_PIC 1
# elif defined(_MSC_VER) /* Visual C++ has trouble too, but it'll only tell us when C4688 is enabled. */
#  define RT_INLINE_DONT_MIX_CMPXCHG8B_AND_PIC 0
# elif (   (defined(PIC) || defined(__PIC__)) \
        && defined(RT_ARCH_X86) \
        && (   RT_INLINE_ASM_GCC_4_3_X_X86 \
            || defined(RT_OS_DARWIN)) )
#  define RT_INLINE_DONT_MIX_CMPXCHG8B_AND_PIC 1
# else
#  define RT_INLINE_DONT_MIX_CMPXCHG8B_AND_PIC 0
# endif
#endif
252
253
/*
 * ARM is great fun.
 *
 * This section provides, for ARM64 and ARM32:
 *   - RTASM_ARM_<barrier>: assembly text for a barrier instruction (empty for
 *     NO_BARRIER).
 *   - RTASM_ARM_<barrier>_IN_REG / _COMMA_IN_REG: the input operand (without /
 *     with a leading comma) the barrier text needs, empty when none is needed.
 *   - RTASM_ARM_PICK_6432(expr64, expr32): selects expr64 on ARM64 and expr32
 *     on ARM32.
 *   - RTASM_ARM_LOAD_MODIFY_STORE_RET_{NEW,OLD}_{32,64}: load-exclusive /
 *     modify / store-exclusive retry loops.
 *
 * Parameters shared by the RTASM_ARM_LOAD_MODIFY_STORE_* macros:
 *   name           Stringified into the retry label to make it recognizable.
 *   a_pu32Mem /
 *   a_pu64Mem      Pointer to the memory operand that is updated.
 *   barrier_type   Barrier name without the RTASM_ARM_ prefix (NO_BARRIER,
 *                  DMB_SY, ...); selects the barrier text placed after the
 *                  retry label.
 *   modify64       Assembly text doing the modification, used on ARM64.
 *   modify32       Assembly text doing the modification, used on ARM32.
 *   in_reg         Input operand(s) needed by the modify text.
 *
 * Each of these macros declares its locals in the invoking scope: rcSpill
 * (store-exclusive status) plus u32NewRet/u64NewRet for the RET_NEW variants,
 * or u32OldRet/u64OldRet and u32NewSpill/u64NewSpill for the RET_OLD variants.
 * The loop is repeated for as long as the store-exclusive status is non-zero.
 */
#if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)

# define RTASM_ARM_NO_BARRIER
# ifdef RT_ARCH_ARM64
/* ARM64: the barrier instructions take no register operand, so all the
   xxx_IN_REG and xxx_COMMA_IN_REG macros are empty. */
#  define RTASM_ARM_NO_BARRIER_IN_REG
#  define RTASM_ARM_NO_BARRIER_COMMA_IN_REG
#  define RTASM_ARM_DSB_SY "dsb sy\n\t"
#  define RTASM_ARM_DSB_SY_IN_REG
#  define RTASM_ARM_DSB_SY_COMMA_IN_REG
#  define RTASM_ARM_DMB_SY "dmb sy\n\t"
#  define RTASM_ARM_DMB_SY_IN_REG
#  define RTASM_ARM_DMB_SY_COMMA_IN_REG
#  define RTASM_ARM_DMB_ST "dmb st\n\t"
#  define RTASM_ARM_DMB_ST_IN_REG
#  define RTASM_ARM_DMB_ST_COMMA_IN_REG
#  define RTASM_ARM_DMB_LD "dmb ld\n\t"
#  define RTASM_ARM_DMB_LD_IN_REG
#  define RTASM_ARM_DMB_LD_COMMA_IN_REG
#  define RTASM_ARM_PICK_6432(expr64, expr32) expr64
/** ARM64, 32-bit operand: the modify text updates uNew in place; the new
 *  value ends up in u32NewRet. */
#  define RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_32(name, a_pu32Mem, barrier_type, modify64, modify32, in_reg) \
    uint32_t rcSpill; \
    uint32_t u32NewRet; \
    __asm__ __volatile__("Ltry_again_" #name "_%=:\n\t" \
                         RTASM_ARM_##barrier_type /* before label? */ \
                         "ldaxr %w[uNew], %[pMem]\n\t" \
                         modify64 \
                         "stlxr %w[rc], %w[uNew], %[pMem]\n\t" \
                         "cbnz %w[rc], Ltry_again_" #name "_%=\n\t" \
                         : [pMem] "+Q" (*a_pu32Mem) \
                         , [uNew] "=&r" (u32NewRet) \
                         , [rc] "=&r" (rcSpill) \
                         : in_reg \
                         : "cc")
/** ARM64, 32-bit operand: the loaded value is kept in u32OldRet while the
 *  modify text produces the value to store in uNew (u32NewSpill). */
#  define RTASM_ARM_LOAD_MODIFY_STORE_RET_OLD_32(name, a_pu32Mem, barrier_type, modify64, modify32, in_reg) \
    uint32_t rcSpill; \
    uint32_t u32OldRet; \
    uint32_t u32NewSpill; \
    __asm__ __volatile__("Ltry_again_" #name "_%=:\n\t" \
                         RTASM_ARM_##barrier_type /* before label? */ \
                         "ldaxr %w[uOld], %[pMem]\n\t" \
                         modify64 \
                         "stlxr %w[rc], %w[uNew], %[pMem]\n\t" \
                         "cbnz %w[rc], Ltry_again_" #name "_%=\n\t" \
                         : [pMem] "+Q" (*a_pu32Mem) \
                         , [uOld] "=&r" (u32OldRet) \
                         , [uNew] "=&r" (u32NewSpill) \
                         , [rc] "=&r" (rcSpill) \
                         : in_reg \
                         : "cc")
/** ARM64, 64-bit operand: the modify text updates uNew in place; the new
 *  value ends up in u64NewRet. */
#  define RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_64(name, a_pu64Mem, barrier_type, modify64, modify32, in_reg) \
    uint32_t rcSpill; \
    uint64_t u64NewRet; \
    __asm__ __volatile__("Ltry_again_" #name "_%=:\n\t" \
                         RTASM_ARM_##barrier_type /* before label? */ \
                         "ldaxr %[uNew], %[pMem]\n\t" \
                         modify64 \
                         "stlxr %w[rc], %[uNew], %[pMem]\n\t" \
                         "cbnz %w[rc], Ltry_again_" #name "_%=\n\t" \
                         : [pMem] "+Q" (*a_pu64Mem) \
                         , [uNew] "=&r" (u64NewRet) \
                         , [rc] "=&r" (rcSpill) \
                         : in_reg \
                         : "cc")
/** ARM64, 64-bit operand: the loaded value is kept in u64OldRet while the
 *  modify text produces the value to store in uNew (u64NewSpill). */
#  define RTASM_ARM_LOAD_MODIFY_STORE_RET_OLD_64(name, a_pu64Mem, barrier_type, modify64, modify32, in_reg) \
    uint32_t rcSpill; \
    uint64_t u64OldRet; \
    uint64_t u64NewSpill; \
    __asm__ __volatile__("Ltry_again_" #name "_%=:\n\t" \
                         RTASM_ARM_##barrier_type /* before label? */ \
                         "ldaxr %[uOld], %[pMem]\n\t" \
                         modify64 \
                         "stlxr %w[rc], %[uNew], %[pMem]\n\t" \
                         "cbnz %w[rc], Ltry_again_" #name "_%=\n\t" \
                         : [pMem] "+Q" (*a_pu64Mem) \
                         , [uOld] "=&r" (u64OldRet) \
                         , [uNew] "=&r" (u64NewSpill) \
                         , [rc] "=&r" (rcSpill) \
                         : in_reg \
                         : "cc")

# else /* RT_ARCH_ARM32 */
#  define RTASM_ARM_PICK_6432(expr64, expr32) expr32
#  if RT_ARCH_ARM32 >= 7
/* ARMv7: dsb/dmb instructions; a dummy "X" input operand is supplied so the
   xxx_IN_REG macros are not empty. */
#   warning armv7
#   define RTASM_ARM_NO_BARRIER_IN_REG
#   define RTASM_ARM_NO_BARRIER_COMMA_IN_REG
#   define RTASM_ARM_DSB_SY "dsb sy\n\t"
#   define RTASM_ARM_DSB_SY_IN_REG "X" (0xfade)
#   define RTASM_ARM_DMB_SY "dmb sy\n\t"
#   define RTASM_ARM_DMB_SY_IN_REG "X" (0xfade)
#   define RTASM_ARM_DMB_ST "dmb st\n\t"
#   define RTASM_ARM_DMB_ST_IN_REG "X" (0xfade)
#   define RTASM_ARM_DMB_LD "dmb ld\n\t"
#   define RTASM_ARM_DMB_LD_IN_REG "X" (0xfade)

#  elif RT_ARCH_ARM32 >= 6
/* ARMv6: barriers are issued with mcr p15 and need a register holding zero.
   The ST and LD flavours map onto the SY one.
   NOTE(review): RTASM_ARM_NO_BARRIER_IN_REG is only defined in the ARMv7
   branch above - confirm nothing uses NO_BARRIER on older cores. */
#   warning armv6
#   define RTASM_ARM_DSB_SY "mcr p15, 0, %[uZero], c7, c10, 4\n\t"
#   define RTASM_ARM_DSB_SY_IN_REG [uZero] "r" (0)
#   define RTASM_ARM_DMB_SY "mcr p15, 0, %[uZero], c7, c10, 5\n\t"
#   define RTASM_ARM_DMB_SY_IN_REG [uZero] "r" (0)
#   define RTASM_ARM_DMB_ST RTASM_ARM_DMB_SY
#   define RTASM_ARM_DMB_ST_IN_REG RTASM_ARM_DMB_SY_IN_REG
#   define RTASM_ARM_DMB_LD RTASM_ARM_DMB_SY
#   define RTASM_ARM_DMB_LD_IN_REG RTASM_ARM_DMB_SY_IN_REG

#  elif RT_ARCH_ARM32 >= 4
/* ARMv5 or older: every DMB flavour maps onto the DSB SY sequence. */
#   warning armv5 or older
#   define RTASM_ARM_DSB_SY "mcr p15, 0, %[uZero], c7, c10, 4\n\t"
#   define RTASM_ARM_DSB_SY_IN_REG [uZero] "r" (0)
#   define RTASM_ARM_DMB_SY RTASM_ARM_DSB_SY
#   define RTASM_ARM_DMB_SY_IN_REG RTASM_ARM_DSB_SY_IN_REG
#   define RTASM_ARM_DMB_ST RTASM_ARM_DSB_SY
#   define RTASM_ARM_DMB_ST_IN_REG RTASM_ARM_DSB_SY_IN_REG
#   define RTASM_ARM_DMB_LD RTASM_ARM_DSB_SY
#   define RTASM_ARM_DMB_LD_IN_REG RTASM_ARM_DSB_SY_IN_REG
#  else
#   error "huh? Odd RT_ARCH_ARM32 value!"
#  endif
#  define RTASM_ARM_DSB_SY_COMMA_IN_REG , RTASM_ARM_DSB_SY_IN_REG
#  define RTASM_ARM_DMB_SY_COMMA_IN_REG , RTASM_ARM_DMB_SY_IN_REG
#  define RTASM_ARM_DMB_ST_COMMA_IN_REG , RTASM_ARM_DMB_ST_IN_REG
#  define RTASM_ARM_DMB_LD_COMMA_IN_REG , RTASM_ARM_DMB_LD_IN_REG
/** ARM32, 32-bit operand: the modify text updates uNew in place; the new
 *  value ends up in u32NewRet. */
#  define RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_32(name, a_pu32Mem, barrier_type, modify64, modify32, in_reg) \
    uint32_t rcSpill; \
    uint32_t u32NewRet; \
    __asm__ __volatile__("Ltry_again_" #name "_%=:\n\t" \
                         RT_CONCAT(RTASM_ARM_,barrier_type) /* before label? */ \
                         "ldrex %[uNew], %[pMem]\n\t" \
                         modify32 \
                         "strex %[rc], %[uNew], %[pMem]\n\t" \
                         "cmp %[rc], #0\n\t" \
                         "bne Ltry_again_" #name "_%=\n\t" \
                         : [pMem] "+m" (*a_pu32Mem) \
                         , [uNew] "=&r" (u32NewRet) \
                         , [rc] "=&r" (rcSpill) \
                         : RT_CONCAT3(RTASM_ARM_,barrier_type,_IN_REG) \
                         , in_reg \
                         : "cc")
/** ARM32, 32-bit operand: the loaded value is kept in u32OldRet while the
 *  modify text produces the value to store in uNew (u32NewSpill). */
#  define RTASM_ARM_LOAD_MODIFY_STORE_RET_OLD_32(name, a_pu32Mem, barrier_type, modify64, modify32, in_reg) \
    uint32_t rcSpill; \
    uint32_t u32OldRet; \
    uint32_t u32NewSpill; \
    __asm__ __volatile__("Ltry_again_" #name "_%=:\n\t" \
                         RT_CONCAT(RTASM_ARM_,barrier_type) /* before label? */ \
                         "ldrex %[uOld], %[pMem]\n\t" \
                         modify32 \
                         "strex %[rc], %[uNew], %[pMem]\n\t" \
                         "cmp %[rc], #0\n\t" \
                         "bne Ltry_again_" #name "_%=\n\t" \
                         : [pMem] "+m" (*a_pu32Mem) \
                         , [uOld] "=&r" (u32OldRet) \
                         , [uNew] "=&r" (u32NewSpill) \
                         , [rc] "=&r" (rcSpill) \
                         : RT_CONCAT3(RTASM_ARM_,barrier_type,_IN_REG) \
                         , in_reg \
                         : "cc")
/** ARM32, 64-bit operand (ldrexd/strexd register pairs): the modify text
 *  updates uNew in place; the new value ends up in u64NewRet. */
#  define RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_64(name, a_pu64Mem, barrier_type, modify64, modify32, in_reg) \
    uint32_t rcSpill; \
    uint64_t u64NewRet; \
    __asm__ __volatile__("Ltry_again_" #name "_%=:\n\t" \
                         RT_CONCAT(RTASM_ARM_,barrier_type) /* before label? */ \
                         "ldrexd %[uNew], %H[uNew], %[pMem]\n\t" \
                         modify32 \
                         "strexd %[rc], %[uNew], %H[uNew], %[pMem]\n\t" \
                         "cmp %[rc], #0\n\t" \
                         "bne Ltry_again_" #name "_%=\n\t" \
                         : [pMem] "+m" (*a_pu64Mem), \
                           [uNew] "=&r" (u64NewRet), \
                           [rc] "=&r" (rcSpill) \
                         : RT_CONCAT3(RTASM_ARM_,barrier_type,_IN_REG) \
                         , in_reg \
                         : "cc")
/** ARM32, 64-bit operand (ldrexd/strexd register pairs): the loaded value is
 *  kept in u64OldRet while the modify text produces the value to store in
 *  uNew (u64NewSpill). */
#  define RTASM_ARM_LOAD_MODIFY_STORE_RET_OLD_64(name, a_pu64Mem, barrier_type, modify64, modify32, in_reg) \
    uint32_t rcSpill; \
    uint64_t u64OldRet; \
    uint64_t u64NewSpill; \
    __asm__ __volatile__("Ltry_again_" #name "_%=:\n\t" \
                         RT_CONCAT(RTASM_ARM_,barrier_type) /* before label? */ \
                         "ldrexd %[uOld], %H[uOld], %[pMem]\n\t" \
                         modify32 \
                         "strexd %[rc], %[uNew], %H[uNew], %[pMem]\n\t" \
                         "cmp %[rc], #0\n\t" \
                         "bne Ltry_again_" #name "_%=\n\t" \
                         : [pMem] "+m" (*a_pu64Mem), \
                           [uOld] "=&r" (u64OldRet), \
                           [uNew] "=&r" (u64NewSpill), \
                           [rc] "=&r" (rcSpill) \
                         : RT_CONCAT3(RTASM_ARM_,barrier_type,_IN_REG) \
                         , in_reg \
                         : "cc")
# endif /* RT_ARCH_ARM32 */
#endif
450
451
/** @def ASMReturnAddress
 * Yields the return address of the current function or method (i.e. the
 * location in the caller that execution returns to).
 *
 * Maps to the _ReturnAddress intrinsic for Visual C++ and to
 * __builtin_return_address(0) for GCC compatible compilers.  Watcom has no
 * equivalent, so any use there references an undeclared function.
 */
#ifdef _MSC_VER
# ifdef __cplusplus
extern "C"
# endif
void * _ReturnAddress(void);
# pragma intrinsic(_ReturnAddress)
# define ASMReturnAddress()     _ReturnAddress()
#elif defined(__GNUC__) || defined(DOXYGEN_RUNNING)
# define ASMReturnAddress()     __builtin_return_address(0)
#elif defined(__WATCOMC__)
# define ASMReturnAddress()     Watcom_does_not_appear_to_have_intrinsic_return_address_function()
#else
# error "Unsupported compiler."
#endif
469
470
471/**
472 * Compiler memory barrier.
473 *
474 * Ensure that the compiler does not use any cached (register/tmp stack) memory
475 * values or any outstanding writes when returning from this function.
476 *
477 * This function must be used if non-volatile data is modified by a
478 * device or the VMM. Typical cases are port access, MMIO access,
479 * trapping instruction, etc.
480 */
481#if RT_INLINE_ASM_GNU_STYLE
482# define ASMCompilerBarrier() do { __asm__ __volatile__("" : : : "memory"); } while (0)
483#elif RT_INLINE_ASM_USES_INTRIN
484# define ASMCompilerBarrier() do { _ReadWriteBarrier(); } while (0)
485#elif defined(__WATCOMC__)
486void ASMCompilerBarrier(void);
487#else /* 2003 should have _ReadWriteBarrier() but I guess we're at 2002 level then... */
488DECLINLINE(void) ASMCompilerBarrier(void) RT_NOTHROW_DEF
489{
490 __asm
491 {
492 }
493}
494#endif
495
496
/** @def ASMBreakpoint
 * Debugger breakpoint; simply expands to RT_BREAKPOINT().
 * @deprecated Use RT_BREAKPOINT instead.
 * @internal
 */
#define ASMBreakpoint()         RT_BREAKPOINT()
503
504
505/**
506 * Spinloop hint for platforms that have these, empty function on the other
507 * platforms.
508 *
509 * x86 & AMD64: The PAUSE variant of NOP for helping hyperthreaded CPUs detecting
510 * spin locks.
511 */
512#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && (defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86))
513RT_ASM_DECL_PRAGMA_WATCOM(void) ASMNopPause(void) RT_NOTHROW_PROTO;
514#else
515DECLINLINE(void) ASMNopPause(void) RT_NOTHROW_DEF
516{
517# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
518# if RT_INLINE_ASM_GNU_STYLE
519 __asm__ __volatile__(".byte 0xf3,0x90\n\t");
520# else
521 __asm {
522 _emit 0f3h
523 _emit 090h
524 }
525# endif
526
527# elif defined(RT_ARCH_ARM32) || defined(RT_ARCH_ARM64)
528# if RT_INLINE_ASM_USES_INTRIN
529 __yield();
530# else
531 __asm__ __volatile__("yield\n\t"); /* ARMv6K+ */
532# endif
533
534# else
535 /* dummy */
536# endif
537}
538#endif
539
540
541/**
542 * Atomically Exchange an unsigned 8-bit value, ordered.
543 *
544 * @returns Current *pu8 value
545 * @param pu8 Pointer to the 8-bit variable to update.
546 * @param u8 The 8-bit value to assign to *pu8.
547 */
548#if RT_INLINE_ASM_EXTERNAL_TMP_ARM
549RT_ASM_DECL_PRAGMA_WATCOM(uint8_t) ASMAtomicXchgU8(volatile uint8_t RT_FAR *pu8, uint8_t u8) RT_NOTHROW_PROTO;
550#else
551DECLINLINE(uint8_t) ASMAtomicXchgU8(volatile uint8_t RT_FAR *pu8, uint8_t u8) RT_NOTHROW_DEF
552{
553# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
554# if RT_INLINE_ASM_GNU_STYLE
555 __asm__ __volatile__("xchgb %0, %1\n\t"
556 : "=m" (*pu8)
557 , "=q" (u8) /* =r - busted on g++ (GCC) 3.4.4 20050721 (Red Hat 3.4.4-2) */
558 : "1" (u8)
559 , "m" (*pu8));
560# else
561 __asm
562 {
563# ifdef RT_ARCH_AMD64
564 mov rdx, [pu8]
565 mov al, [u8]
566 xchg [rdx], al
567 mov [u8], al
568# else
569 mov edx, [pu8]
570 mov al, [u8]
571 xchg [edx], al
572 mov [u8], al
573# endif
574 }
575# endif
576 return u8;
577
578# elif defined(RT_ARCH_ARM32) || defined(RT_ARCH_ARM64)
579# if RT_INLINE_ASM_USES_INTRIN
580# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
581 return __swpal8(pu8, u8);
582# else
583 uint8_t uOld = __swp8(pu8, u8);
584 __dmb(_ARM64_BARRIER_SY);
585 return uOld;
586# endif
587
588# else
589 uint32_t uOld;
590# if defined(RTASM_ARM64_USE_FEAT_LSE)
591 /* SWPALB is ~40% more expensive than the non-LSE variant (M1), but since we
592 have the barrier we shouldn't need that, right? Ordering should be taken
593 care of by the DMB. The SWPB is rather cheap (~70% faster). */
594 __asm__ __volatile__("Lstart_ASMAtomicXchgU8_%=:\n\t"
595# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
596 "swpalb %w[uNew], %w[uOld], %[pMem]\n\t"
597# else
598 RTASM_ARM_DMB_SY
599 "swpb %w[uNew], %w[uOld], %[pMem]\n\t"
600# endif
601 : [pMem] "+Q" (*pu8)
602 , [uOld] "=&r" (uOld)
603 : [uNew] "r" ((uint32_t)u8)
604 : );
605# else
606 uint32_t rcSpill;
607 __asm__ __volatile__("Ltry_again_ASMAtomicXchgU8_%=:\n\t"
608 RTASM_ARM_DMB_SY
609# if defined(RT_ARCH_ARM64)
610 "ldaxrb %w[uOld], %[pMem]\n\t"
611 "stlxrb %w[rc], %w[uNew], %[pMem]\n\t"
612 "cbnz %w[rc], Ltry_again_ASMAtomicXchgU8_%=\n\t"
613# else
614 "ldrexb %[uOld], %[pMem]\n\t" /* ARMv6+ */
615 "strexb %[rc], %[uNew], %[pMem]\n\t"
616 "cmp %[rc], #0\n\t"
617 "bne Ltry_again_ASMAtomicXchgU8_%=\n\t"
618# endif
619 : [pMem] "+Q" (*pu8)
620 , [uOld] "=&r" (uOld)
621 , [rc] "=&r" (rcSpill)
622 : [uNew] "r" ((uint32_t)u8)
623 RTASM_ARM_DMB_SY_COMMA_IN_REG
624 : "cc");
625# endif
626 return (uint8_t)uOld;
627# endif
628
629# else
630# error "Port me"
631# endif
632}
633#endif
634
635
636/**
637 * Atomically Exchange a signed 8-bit value, ordered.
638 *
639 * @returns Current *pu8 value
640 * @param pi8 Pointer to the 8-bit variable to update.
641 * @param i8 The 8-bit value to assign to *pi8.
642 */
643DECLINLINE(int8_t) ASMAtomicXchgS8(volatile int8_t RT_FAR *pi8, int8_t i8) RT_NOTHROW_DEF
644{
645 return (int8_t)ASMAtomicXchgU8((volatile uint8_t RT_FAR *)pi8, (uint8_t)i8);
646}
647
648
649/**
650 * Atomically Exchange a bool value, ordered.
651 *
652 * @returns Current *pf value
653 * @param pf Pointer to the 8-bit variable to update.
654 * @param f The 8-bit value to assign to *pi8.
655 */
656DECLINLINE(bool) ASMAtomicXchgBool(volatile bool RT_FAR *pf, bool f) RT_NOTHROW_DEF
657{
658#ifdef _MSC_VER
659 return !!ASMAtomicXchgU8((volatile uint8_t RT_FAR *)pf, (uint8_t)f);
660#else
661 return (bool)ASMAtomicXchgU8((volatile uint8_t RT_FAR *)pf, (uint8_t)f);
662#endif
663}
664
665
666/**
667 * Atomically Exchange an unsigned 16-bit value, ordered.
668 *
669 * @returns Current *pu16 value
670 * @param pu16 Pointer to the 16-bit variable to update.
671 * @param u16 The 16-bit value to assign to *pu16.
672 */
673#if RT_INLINE_ASM_EXTERNAL_TMP_ARM
674RT_ASM_DECL_PRAGMA_WATCOM(uint16_t) ASMAtomicXchgU16(volatile uint16_t RT_FAR *pu16, uint16_t u16) RT_NOTHROW_PROTO;
675#else
676DECLINLINE(uint16_t) ASMAtomicXchgU16(volatile uint16_t RT_FAR *pu16, uint16_t u16) RT_NOTHROW_DEF
677{
678# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
679# if RT_INLINE_ASM_GNU_STYLE
680 __asm__ __volatile__("xchgw %0, %1\n\t"
681 : "=m" (*pu16)
682 , "=r" (u16)
683 : "1" (u16)
684 , "m" (*pu16));
685# else
686 __asm
687 {
688# ifdef RT_ARCH_AMD64
689 mov rdx, [pu16]
690 mov ax, [u16]
691 xchg [rdx], ax
692 mov [u16], ax
693# else
694 mov edx, [pu16]
695 mov ax, [u16]
696 xchg [edx], ax
697 mov [u16], ax
698# endif
699 }
700# endif
701 return u16;
702
703# elif defined(RT_ARCH_ARM32) || defined(RT_ARCH_ARM64)
704# if RT_INLINE_ASM_USES_INTRIN
705# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
706 return __swpal16(pu16, u16);
707# else
708 uint16_t uOld = __swp16(pu16, u16);
709 __dmb(_ARM64_BARRIER_SY);
710 return uOld;
711# endif
712
713# else
714 uint32_t uOld;
715# if defined(RTASM_ARM64_USE_FEAT_LSE)
716 /* SWPALH is ~40% more expensive than the non-LSE variant on an M1, 20%
717 slower if we remove the barrier. But since we have the barrier we
718 shouldn't need that, right? Ordering should be taken care of by the DMB.
719 The SWPH is rather cheap (~70% faster). */
720 __asm__ __volatile__("Lstart_ASMAtomicXchgU16_%=:\n\t"
721# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
722 "swpalh %w[uNew], %w[uOld], %[pMem]\n\t"
723# else
724 RTASM_ARM_DMB_SY
725 "swph %w[uNew], %w[uOld], %[pMem]\n\t"
726# endif
727 : [pMem] "+Q" (*pu16)
728 , [uOld] "=&r" (uOld)
729 : [uNew] "r" ((uint32_t)u16)
730 : );
731# else
732 uint32_t rcSpill;
733 __asm__ __volatile__("Ltry_again_ASMAtomicXchgU16_%=:\n\t"
734 RTASM_ARM_DMB_SY
735# if defined(RT_ARCH_ARM64)
736 "ldaxrh %w[uOld], %[pMem]\n\t"
737 "stlxrh %w[rc], %w[uNew], %[pMem]\n\t"
738 "cbnz %w[rc], Ltry_again_ASMAtomicXchgU16_%=\n\t"
739# else
740 "ldrexh %[uOld], %[pMem]\n\t" /* ARMv6+ */
741 "strexh %[rc], %[uNew], %[pMem]\n\t"
742 "cmp %[rc], #0\n\t"
743 "bne Ltry_again_ASMAtomicXchgU16_%=\n\t"
744# endif
745 : [pMem] "+Q" (*pu16)
746 , [uOld] "=&r" (uOld)
747 , [rc] "=&r" (rcSpill)
748 : [uNew] "r" ((uint32_t)u16)
749 RTASM_ARM_DMB_SY_COMMA_IN_REG
750 : "cc");
751# endif
752 return (uint16_t)uOld;
753# endif
754
755# else
756# error "Port me"
757# endif
758}
759#endif
760
761
762/**
763 * Atomically Exchange a signed 16-bit value, ordered.
764 *
765 * @returns Current *pu16 value
766 * @param pi16 Pointer to the 16-bit variable to update.
767 * @param i16 The 16-bit value to assign to *pi16.
768 */
769DECLINLINE(int16_t) ASMAtomicXchgS16(volatile int16_t RT_FAR *pi16, int16_t i16) RT_NOTHROW_DEF
770{
771 return (int16_t)ASMAtomicXchgU16((volatile uint16_t RT_FAR *)pi16, (uint16_t)i16);
772}
773
774
775/**
776 * Atomically Exchange an unsigned 32-bit value, ordered.
777 *
778 * @returns Current *pu32 value
779 * @param pu32 Pointer to the 32-bit variable to update.
780 * @param u32 The 32-bit value to assign to *pu32.
781 *
782 * @remarks Does not work on 286 and earlier.
783 */
784#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
785RT_ASM_DECL_PRAGMA_WATCOM(uint32_t) ASMAtomicXchgU32(volatile uint32_t RT_FAR *pu32, uint32_t u32) RT_NOTHROW_PROTO;
786#else
787DECLINLINE(uint32_t) ASMAtomicXchgU32(volatile uint32_t RT_FAR *pu32, uint32_t u32) RT_NOTHROW_DEF
788{
789# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
790# if RT_INLINE_ASM_GNU_STYLE
791 __asm__ __volatile__("xchgl %0, %1\n\t"
792 : "=m" (*pu32) /** @todo r=bird: +m rather than =m here? */
793 , "=r" (u32)
794 : "1" (u32)
795 , "m" (*pu32));
796
797# elif RT_INLINE_ASM_USES_INTRIN
798 u32 = _InterlockedExchange((long RT_FAR *)pu32, u32);
799
800# else
801 __asm
802 {
803# ifdef RT_ARCH_AMD64
804 mov rdx, [pu32]
805 mov eax, u32
806 xchg [rdx], eax
807 mov [u32], eax
808# else
809 mov edx, [pu32]
810 mov eax, u32
811 xchg [edx], eax
812 mov [u32], eax
813# endif
814 }
815# endif
816 return u32;
817
818# elif defined(RT_ARCH_ARM32) || defined(RT_ARCH_ARM64)
819
820# if RT_INLINE_ASM_USES_INTRIN
821# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
822 return __swpal32(pu32, u32);
823# else
824 uint32_t uOld = __swp32(pu32, u32);
825 __dmb(_ARM64_BARRIER_SY);
826 return uOld;
827# endif
828
829# else
830 uint32_t uOld;
831# if defined(RTASM_ARM64_USE_FEAT_LSE)
832 /* SWPAL is ~40% more expensive than the non-LSE variant on an M1, 20%
833 slower if we remove the barrier. But since we have the barrier we
834 shouldn't need that, right? Ordering should be taken care of by the DMB.
835 The SWP is rather cheap (~70% faster). */
836 __asm__ __volatile__("Lstart_ASMAtomicXchgU32_%=:\n\t"
837# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
838 "swpal %w[uNew], %w[uOld], %[pMem]\n\t"
839# else
840 RTASM_ARM_DMB_SY
841 "swp %w[uNew], %w[uOld], %[pMem]\n\t"
842# endif
843 : [pMem] "+Q" (*pu32)
844 , [uOld] "=&r" (uOld)
845 : [uNew] "r" (u32)
846 : );
847# else
848 uint32_t rcSpill;
849 __asm__ __volatile__("Ltry_again_ASMAtomicXchgU32_%=:\n\t"
850 RTASM_ARM_DMB_SY
851# if defined(RT_ARCH_ARM64)
852 "ldaxr %w[uOld], %[pMem]\n\t"
853 "stlxr %w[rc], %w[uNew], %[pMem]\n\t"
854 "cbnz %w[rc], Ltry_again_ASMAtomicXchgU32_%=\n\t"
855# else
856 "ldrex %[uOld], %[pMem]\n\t" /* ARMv6+ */
857 "strex %[rc], %[uNew], %[pMem]\n\t"
858 "cmp %[rc], #0\n\t"
859 "bne Ltry_again_ASMAtomicXchgU32_%=\n\t"
860# endif
861 : [pMem] "+Q" (*pu32)
862 , [uOld] "=&r" (uOld)
863 , [rc] "=&r" (rcSpill)
864 : [uNew] "r" (u32)
865 RTASM_ARM_DMB_SY_COMMA_IN_REG
866 : "cc");
867# endif
868 return uOld;
869# endif
870
871# else
872# error "Port me"
873# endif
874}
875#endif
876
877
878/**
879 * Atomically Exchange a signed 32-bit value, ordered.
880 *
881 * @returns Current *pu32 value
882 * @param pi32 Pointer to the 32-bit variable to update.
883 * @param i32 The 32-bit value to assign to *pi32.
884 */
885DECLINLINE(int32_t) ASMAtomicXchgS32(volatile int32_t RT_FAR *pi32, int32_t i32) RT_NOTHROW_DEF
886{
887 return (int32_t)ASMAtomicXchgU32((volatile uint32_t RT_FAR *)pi32, (uint32_t)i32);
888}
889
890
/**
 * Atomically Exchange an unsigned 64-bit value, ordered.
 *
 * Stores @a u64 into @a *pu64 and hands back the value that was there before.
 *
 * Depending on the build configuration this is either only declared here
 * (prototype, the implementation must then be supplied elsewhere), or defined
 * inline with one implementation per architecture / compiler flavour.
 *
 * @returns Current *pu64 value
 * @param   pu64    Pointer to the 64-bit variable to update.
 * @param   u64     The 64-bit value to assign to *pu64.
 *
 * @remarks Works on 32-bit x86 CPUs starting with Pentium.
 */
#if (RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN) \
 || RT_INLINE_DONT_MIX_CMPXCHG8B_AND_PIC
RT_ASM_DECL_PRAGMA_WATCOM(uint64_t) ASMAtomicXchgU64(volatile uint64_t RT_FAR *pu64, uint64_t u64) RT_NOTHROW_PROTO;
#else
DECLINLINE(uint64_t) ASMAtomicXchgU64(volatile uint64_t RT_FAR *pu64, uint64_t u64) RT_NOTHROW_DEF
{
# if defined(RT_ARCH_AMD64)
    /*
     * AMD64: a single 64-bit exchange, via compiler intrinsic, GNU inline
     * assembly or MSC-style inline assembly.
     */
#  if RT_INLINE_ASM_USES_INTRIN
    return _InterlockedExchange64((__int64 *)pu64, u64);

#  elif RT_INLINE_ASM_GNU_STYLE
    /* u64 is both input (tied via "1") and output: after the xchgq it holds
       the previous memory content. */
    __asm__ __volatile__("xchgq %0, %1\n\t"
                         : "=m" (*pu64)
                         , "=r" (u64)
                         : "1" (u64)
                         , "m" (*pu64));
    return u64;
#  else
    __asm
    {
        mov     rdx, [pu64]
        mov     rax, [u64]
        xchg    [rdx], rax
        mov     [u64], rax
    }
    return u64;
#  endif

# elif defined(RT_ARCH_X86)
    /*
     * 32-bit x86: the exchange is built as a retry loop around
     * 'lock cmpxchg8b' which is repeated (jnz) until it succeeds.  The new
     * value goes in ecx:ebx, the old value comes back in edx:eax.
     */
#  if RT_INLINE_ASM_GNU_STYLE
#   if defined(PIC) || defined(__PIC__)
    /* PIC build: ebx is not handed to the asm as a plain register operand.
       The low dword of the new value is parked in u32EBX, swapped into ebx
       for the duration of the loop and the original ebx is moved back
       afterwards.  The variable is addressed via esi ("S"). */
    uint32_t u32EBX = (uint32_t)u64;
    __asm__ __volatile__(/*"xchgl %%esi, %5\n\t"*/
                         "xchgl %%ebx, %3\n\t"
                         "1:\n\t"
                         "lock; cmpxchg8b (%5)\n\t"
                         "jnz 1b\n\t"
                         "movl %3, %%ebx\n\t"
                         /*"xchgl %%esi, %5\n\t"*/
                         : "=A" (u64)
                         , "=m" (*pu64)
                         : "0" (*pu64)
                         , "m" ( u32EBX )
                         , "c" ( (uint32_t)(u64 >> 32) )
                         , "S" (pu64)
                         : "cc");
#   else /* !PIC */
    __asm__ __volatile__("1:\n\t"
                         "lock; cmpxchg8b %1\n\t"
                         "jnz 1b\n\t"
                         : "=A" (u64)
                         , "=m" (*pu64)
                         : "0" (*pu64)
                         , "b" ( (uint32_t)u64 )
                         , "c" ( (uint32_t)(u64 >> 32) )
                         : "cc");
#   endif
#  else
    __asm
    {
        mov     ebx, dword ptr [u64]
        mov     ecx, dword ptr [u64 + 4]
        mov     edi, pu64
        mov     eax, dword ptr [edi]
        mov     edx, dword ptr [edi + 4]
    retry:
        lock cmpxchg8b [edi]
        jnz retry
        mov     dword ptr [u64], eax
        mov     dword ptr [u64 + 4], edx
    }
#  endif
    return u64;

# elif defined(RT_ARCH_ARM32) || defined(RT_ARCH_ARM64)
    /*
     * ARM: compiler intrinsics, a single swap instruction (when
     * RTASM_ARM64_USE_FEAT_LSE is defined) or an exclusive load/store loop.
     */
#  if RT_INLINE_ASM_USES_INTRIN
#   if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
    return __swpal64(pu64, u64);
#   else
    uint64_t uOld = __swp64(pu64, u64);
    __dmb(_ARM64_BARRIER_SY);
    return uOld;
#   endif

#  else
    uint64_t uOld;
#   if defined(RTASM_ARM64_USE_FEAT_LSE)
    /* SWPAL is ~40% more expensive than the non-LSE variant on an M1, 20%
       slower if we remove the barrier.  But since we have the barrier we
       shouldn't need that, right? Ordering should be taken care of by the DMB.
       The SWP is rather cheap (~70% faster). */
    __asm__ __volatile__("Lstart_ASMAtomicXchgU64_%=:\n\t"
#    if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
                         "swpal %[uNew], %[uOld], %[pMem]\n\t"
#    else
                         RTASM_ARM_DMB_SY
                         "swp %[uNew], %[uOld], %[pMem]\n\t"
#    endif
                         : [pMem] "+Q" (*pu64)
                         , [uOld] "=&r" (uOld)
                         : [uNew] "r" (u64)
                         : );
#   else
    /* Exclusive load + exclusive store; rcSpill receives the store status
       and the sequence is restarted from the label while it is non-zero. */
    uint32_t rcSpill;
    __asm__ __volatile__("Ltry_again_ASMAtomicXchgU64_%=:\n\t"
                         RTASM_ARM_DMB_SY
#    if defined(RT_ARCH_ARM64)
                         "ldaxr %[uOld], %[pMem]\n\t"
                         "stlxr %w[rc], %[uNew], %[pMem]\n\t"
                         "cbnz %w[rc], Ltry_again_ASMAtomicXchgU64_%=\n\t"
#    else
                         "ldrexd %[uOld], %H[uOld], %[pMem]\n\t" /* ARMv6+ */
                         "strexd %[rc], %[uNew], %H[uNew], %[pMem]\n\t"
                         "cmp %[rc], #0\n\t"
                         "bne Ltry_again_ASMAtomicXchgU64_%=\n\t"
#    endif
                         : [pMem] "+Q" (*pu64)
                         , [uOld] "=&r" (uOld)
                         , [rc] "=&r" (rcSpill)
                         : [uNew] "r" (u64)
                           RTASM_ARM_DMB_SY_COMMA_IN_REG
                         : "cc");
#   endif
    return uOld;
#  endif

# else
#  error "Port me"
# endif
}
#endif
1031
1032
1033/**
1034 * Atomically Exchange an signed 64-bit value, ordered.
1035 *
1036 * @returns Current *pi64 value
1037 * @param pi64 Pointer to the 64-bit variable to update.
1038 * @param i64 The 64-bit value to assign to *pi64.
1039 */
1040DECLINLINE(int64_t) ASMAtomicXchgS64(volatile int64_t RT_FAR *pi64, int64_t i64) RT_NOTHROW_DEF
1041{
1042 return (int64_t)ASMAtomicXchgU64((volatile uint64_t RT_FAR *)pi64, (uint64_t)i64);
1043}
1044
1045
1046/**
1047 * Atomically Exchange a size_t value, ordered.
1048 *
1049 * @returns Current *ppv value
1050 * @param puDst Pointer to the size_t variable to update.
1051 * @param uNew The new value to assign to *puDst.
1052 */
1053DECLINLINE(size_t) ASMAtomicXchgZ(size_t volatile RT_FAR *puDst, const size_t uNew) RT_NOTHROW_DEF
1054{
1055#if ARCH_BITS == 16
1056 AssertCompile(sizeof(size_t) == 2);
1057 return ASMAtomicXchgU16((volatile uint16_t RT_FAR *)puDst, uNew);
1058#elif ARCH_BITS == 32
1059 return ASMAtomicXchgU32((volatile uint32_t RT_FAR *)puDst, uNew);
1060#elif ARCH_BITS == 64
1061 return ASMAtomicXchgU64((volatile uint64_t RT_FAR *)puDst, uNew);
1062#else
1063# error "ARCH_BITS is bogus"
1064#endif
1065}
1066
1067
1068/**
1069 * Atomically Exchange a pointer value, ordered.
1070 *
1071 * @returns Current *ppv value
1072 * @param ppv Pointer to the pointer variable to update.
1073 * @param pv The pointer value to assign to *ppv.
1074 */
1075DECLINLINE(void RT_FAR *) ASMAtomicXchgPtr(void RT_FAR * volatile RT_FAR *ppv, const void RT_FAR *pv) RT_NOTHROW_DEF
1076{
1077#if ARCH_BITS == 32 || ARCH_BITS == 16
1078 return (void RT_FAR *)ASMAtomicXchgU32((volatile uint32_t RT_FAR *)(void RT_FAR *)ppv, (uint32_t)pv);
1079#elif ARCH_BITS == 64
1080 return (void RT_FAR *)ASMAtomicXchgU64((volatile uint64_t RT_FAR *)(void RT_FAR *)ppv, (uint64_t)pv);
1081#else
1082# error "ARCH_BITS is bogus"
1083#endif
1084}
1085
1086
/**
 * Convenience macro for avoiding the annoying casting with ASMAtomicXchgPtr.
 *
 * With GCC compatible compilers the arguments are first assigned to typed
 * temporaries (the destination via __typeof__(*(ppv)), the new value via
 * @a Type) before being cast to void pointers for the ASMAtomicXchgPtr() call,
 * so mismatching argument types are subject to the compiler's normal
 * assignment checks.  Other compilers get a plain cast-everything variant.
 *
 * @returns Current *ppv value
 * @param   ppv     Pointer to the pointer variable to update.
 * @param   pv      The pointer value to assign to *ppv.
 * @param   Type    The type of *ppv, sans volatile.
 */
#ifdef __GNUC__ /* 8.2.0 requires -Wno-ignored-qualifiers */
# define ASMAtomicXchgPtrT(ppv, pv, Type) \
    __extension__ \
    ({\
        __typeof__(*(ppv)) volatile * const ppvTypeChecked = (ppv); \
        Type                          const pvTypeChecked  = (pv); \
        Type pvTypeCheckedRet = (__typeof__(*(ppv))) ASMAtomicXchgPtr((void * volatile *)ppvTypeChecked, (void *)pvTypeChecked); \
        pvTypeCheckedRet; \
     })
#else
# define ASMAtomicXchgPtrT(ppv, pv, Type) \
    (Type)ASMAtomicXchgPtr((void RT_FAR * volatile RT_FAR *)(ppv), (void RT_FAR *)(pv))
#endif
1108
1109
1110/**
1111 * Atomically Exchange a raw-mode context pointer value, ordered.
1112 *
1113 * @returns Current *ppv value
1114 * @param ppvRC Pointer to the pointer variable to update.
1115 * @param pvRC The pointer value to assign to *ppv.
1116 */
1117DECLINLINE(RTRCPTR) ASMAtomicXchgRCPtr(RTRCPTR volatile RT_FAR *ppvRC, RTRCPTR pvRC) RT_NOTHROW_DEF
1118{
1119 return (RTRCPTR)ASMAtomicXchgU32((uint32_t volatile RT_FAR *)(void RT_FAR *)ppvRC, (uint32_t)pvRC);
1120}
1121
1122
1123/**
1124 * Atomically Exchange a ring-0 pointer value, ordered.
1125 *
1126 * @returns Current *ppv value
1127 * @param ppvR0 Pointer to the pointer variable to update.
1128 * @param pvR0 The pointer value to assign to *ppv.
1129 */
1130DECLINLINE(RTR0PTR) ASMAtomicXchgR0Ptr(RTR0PTR volatile RT_FAR *ppvR0, RTR0PTR pvR0) RT_NOTHROW_DEF
1131{
1132#if R0_ARCH_BITS == 32 || ARCH_BITS == 16
1133 return (RTR0PTR)ASMAtomicXchgU32((volatile uint32_t RT_FAR *)(void RT_FAR *)ppvR0, (uint32_t)pvR0);
1134#elif R0_ARCH_BITS == 64
1135 return (RTR0PTR)ASMAtomicXchgU64((volatile uint64_t RT_FAR *)(void RT_FAR *)ppvR0, (uint64_t)pvR0);
1136#else
1137# error "R0_ARCH_BITS is bogus"
1138#endif
1139}
1140
1141
1142/**
1143 * Atomically Exchange a ring-3 pointer value, ordered.
1144 *
1145 * @returns Current *ppv value
1146 * @param ppvR3 Pointer to the pointer variable to update.
1147 * @param pvR3 The pointer value to assign to *ppv.
1148 */
1149DECLINLINE(RTR3PTR) ASMAtomicXchgR3Ptr(RTR3PTR volatile RT_FAR *ppvR3, RTR3PTR pvR3) RT_NOTHROW_DEF
1150{
1151#if R3_ARCH_BITS == 32 || ARCH_BITS == 16
1152 return (RTR3PTR)ASMAtomicXchgU32((volatile uint32_t RT_FAR *)(void RT_FAR *)ppvR3, (uint32_t)pvR3);
1153#elif R3_ARCH_BITS == 64
1154 return (RTR3PTR)ASMAtomicXchgU64((volatile uint64_t RT_FAR *)(void RT_FAR *)ppvR3, (uint64_t)pvR3);
1155#else
1156# error "R3_ARCH_BITS is bogus"
1157#endif
1158}
1159
1160
/** @def ASMAtomicXchgHandle
 * Atomically Exchange a typical IPRT handle value, ordered.
 *
 * The handle is exchanged as a 32-bit or 64-bit integer, selected by
 * HC_ARCH_BITS (ARCH_BITS == 16 also takes the 32-bit variant).  Both *ph and
 * *phRes are compile-time checked to have exactly that size.
 *
 * @param   ph          Pointer to the value to update.
 * @param   hNew        The new value to assign to *ph.
 * @param   phRes       Where to store the current *ph value.
 *
 * @remarks This doesn't currently work for all handles (like RTFILE).
 */
#if HC_ARCH_BITS == 32 || ARCH_BITS == 16
# define ASMAtomicXchgHandle(ph, hNew, phRes) \
   do { \
       AssertCompile(sizeof(*(ph))    == sizeof(uint32_t)); \
       AssertCompile(sizeof(*(phRes)) == sizeof(uint32_t)); \
       *(uint32_t RT_FAR *)(phRes) = ASMAtomicXchgU32((uint32_t volatile RT_FAR *)(ph), (const uint32_t)(hNew)); \
   } while (0)
#elif HC_ARCH_BITS == 64
# define ASMAtomicXchgHandle(ph, hNew, phRes) \
   do { \
       AssertCompile(sizeof(*(ph))    == sizeof(uint64_t)); \
       AssertCompile(sizeof(*(phRes)) == sizeof(uint64_t)); \
       *(uint64_t RT_FAR *)(phRes) = ASMAtomicXchgU64((uint64_t volatile RT_FAR *)(ph), (const uint64_t)(hNew)); \
   } while (0)
#else
# error HC_ARCH_BITS
#endif
1187
1188
/**
 * Atomically Exchange a value which size might differ
 * between platforms or compilers, ordered.
 *
 * Dispatches on sizeof(*(pu)) to the 8, 16, 32 or 64-bit exchange function.
 * The previous value is discarded.  Any other size triggers an assertion.
 *
 * @param   pu      Pointer to the variable to update.
 * @param   uNew    The value to assign to *pu.
 * @todo    This is busted as its missing the result argument.
 */
#define ASMAtomicXchgSize(pu, uNew) \
    do { \
        switch (sizeof(*(pu))) { \
            case 1: ASMAtomicXchgU8( (volatile uint8_t  RT_FAR *)(void RT_FAR *)(pu), (uint8_t)(uNew)); break; \
            case 2: ASMAtomicXchgU16((volatile uint16_t RT_FAR *)(void RT_FAR *)(pu), (uint16_t)(uNew)); break; \
            case 4: ASMAtomicXchgU32((volatile uint32_t RT_FAR *)(void RT_FAR *)(pu), (uint32_t)(uNew)); break; \
            case 8: ASMAtomicXchgU64((volatile uint64_t RT_FAR *)(void RT_FAR *)(pu), (uint64_t)(uNew)); break; \
            /* sizeof yields size_t; cast so the argument matches the %d conversion. */ \
            default: AssertMsgFailed(("ASMAtomicXchgSize: size %d is not supported\n", (int)sizeof(*(pu)))); \
                break; \
        } \
    } while (0)
1207
/**
 * Atomically Exchange a value which size might differ
 * between platforms or compilers, ordered.
 *
 * Dispatches on sizeof(*(pu)) to the 8, 16, 32 or 64-bit exchange function
 * and stores the previous value through @a puRes using the same width.  Any
 * other size triggers an assertion and leaves *puRes untouched.
 *
 * @param   pu      Pointer to the variable to update.
 * @param   uNew    The value to assign to *pu.
 * @param   puRes   Where to store the current *pu value.
 */
#define ASMAtomicXchgSizeCorrect(pu, uNew, puRes) \
    do { \
        switch (sizeof(*(pu))) { \
            case 1: *(uint8_t  RT_FAR *)(puRes) = ASMAtomicXchgU8( (volatile uint8_t  RT_FAR *)(void RT_FAR *)(pu), (uint8_t)(uNew)); break; \
            case 2: *(uint16_t RT_FAR *)(puRes) = ASMAtomicXchgU16((volatile uint16_t RT_FAR *)(void RT_FAR *)(pu), (uint16_t)(uNew)); break; \
            case 4: *(uint32_t RT_FAR *)(puRes) = ASMAtomicXchgU32((volatile uint32_t RT_FAR *)(void RT_FAR *)(pu), (uint32_t)(uNew)); break; \
            case 8: *(uint64_t RT_FAR *)(puRes) = ASMAtomicXchgU64((volatile uint64_t RT_FAR *)(void RT_FAR *)(pu), (uint64_t)(uNew)); break; \
            /* sizeof yields size_t; cast so the argument matches the %d conversion. */ \
            default: AssertMsgFailed(("ASMAtomicXchgSizeCorrect: size %d is not supported\n", (int)sizeof(*(pu)))); \
                break; \
        } \
    } while (0)
1226
1227
1228
/**
 * Atomically Compare and Exchange an unsigned 8-bit value, ordered.
 *
 * Writes @a u8New to @a *pu8 only if the variable currently holds @a u8Old.
 *
 * Only a prototype is declared when RT_INLINE_ASM_EXTERNAL_TMP_ARM is set, or
 * when the compiler offers no GNU style inline assembly and the target is not
 * ARM; otherwise the function is defined inline below.
 *
 * @returns true if xchg was done.
 * @returns false if xchg wasn't done.
 *
 * @param   pu8         Pointer to the value to update.
 * @param   u8New       The new value to assigned to *pu8.
 * @param   u8Old       The old value to *pu8 compare with.
 *
 * @remarks x86: Requires a 486 or later.
 * @todo Rename ASMAtomicCmpWriteU8
 */
#if RT_INLINE_ASM_EXTERNAL_TMP_ARM || (!RT_INLINE_ASM_GNU_STYLE && !defined(RT_ARCH_ARM64) && !defined(RT_ARCH_ARM32))
RT_ASM_DECL_PRAGMA_WATCOM(bool) ASMAtomicCmpXchgU8(volatile uint8_t RT_FAR *pu8, const uint8_t u8New, const uint8_t u8Old) RT_NOTHROW_PROTO;
#else
DECLINLINE(bool) ASMAtomicCmpXchgU8(volatile uint8_t RT_FAR *pu8, const uint8_t u8New, uint8_t u8Old) RT_NOTHROW_DEF
{
# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
    /* u8Old travels in the accumulator ("a", tied input/output, hence the
       non-const parameter); setz turns the outcome of the compare into the
       0/1 byte that is returned. */
    uint8_t u8Ret;
    __asm__ __volatile__("lock; cmpxchgb %3, %0\n\t"
                         "setz %1\n\t"
                         : "=m" (*pu8)
                         , "=qm" (u8Ret)
                         , "=a" (u8Old)
                         : "q" (u8New)
                         , "2" (u8Old)
                         , "m" (*pu8)
                         : "cc");
    return (bool)u8Ret;

# elif RT_INLINE_ASM_USES_INTRIN
    return (uint8_t)_InterlockedCompareExchange8((char RT_FAR *)pu8, u8New, u8Old) == u8Old;

# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
    /* The asm produces the result as a 32-bit 0/1 value in fXchg.u, which is
       then read back through the bool member of the union. */
    union { uint32_t u; bool f; } fXchg;
    uint32_t u32Spill;
#  if defined(RTASM_ARM64_USE_FEAT_LSE)
    /* Single compare-and-swap instruction; the value it hands back
       (uOldActual, preloaded with u8Old) is compared against the expected
       value to derive the result. */
    __asm__ __volatile__("Lstart_ASMAtomicCmpXchgU8_%=:\n\t"
#   if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB) /* M1 bench: casalb=5625 vs dmb+casb=1597 vs non-lse=5623 (ps/call) */
                         "casalb %w[uOldActual], %w[uNew], %[pMem]\n\t"
#   else
                         RTASM_ARM_DMB_SY
                         "casb %w[uOldActual], %w[uNew], %[pMem]\n\t"
#   endif
                         "cmp %w[uOldActual], %w[uOldOrg]\n\t"
                         "cset %w[fXchg], eq\n\t"
                         : [pMem] "+Q" (*pu8)
                         , [uOldActual] "=&r" (u32Spill)
                         , [fXchg] "=&r" (fXchg.u)
                         : [uNew] "r" ((uint32_t)u8New)
                         , [uOldOrg] "r" ((uint32_t)u8Old)
                         , "[uOldActual]" ((uint32_t)u8Old)
                         : "cc");
#  else
    /* Exclusive load, compare, conditional exclusive store.  fXchg starts out
       as 0 (tied input) and is only set to 1 after a successful store; a
       failed store (rc != 0) restarts the sequence from the label. */
    uint32_t rcSpill;
    __asm__ __volatile__("Ltry_again_ASMAtomicCmpXchgU8_%=:\n\t"
                         RTASM_ARM_DMB_SY
#   if defined(RT_ARCH_ARM64)
                         "ldaxrb %w[uOld], %[pMem]\n\t"
                         "cmp %w[uOld], %w[uCmp]\n\t"
                         "bne 1f\n\t" /* stop here if not equal */
                         "stlxrb %w[rc], %w[uNew], %[pMem]\n\t"
                         "cbnz %w[rc], Ltry_again_ASMAtomicCmpXchgU8_%=\n\t"
                         "mov %w[fXchg], #1\n\t"
                         "1:\n\t"
                         "clrex\n\t"
#   else
                         "ldrexb %[uOld], %[pMem]\n\t"
                         "teq %[uOld], %[uCmp]\n\t"
                         "strexbeq %[rc], %[uNew], %[pMem]\n\t"
                         "bne 1f\n\t" /* stop here if not equal */
                         "cmp %[rc], #0\n\t"
                         "bne Ltry_again_ASMAtomicCmpXchgU8_%=\n\t"
                         "mov %[fXchg], #1\n\t"
                         "1:\n\t"
                         /** @todo clrexne on armv7? */
#   endif
                         : [pMem] "+Q" (*pu8)
                         , [uOld] "=&r" (u32Spill)
                         , [rc] "=&r" (rcSpill)
                         , [fXchg] "=&r" (fXchg.u)
                         : [uCmp] "r" ((uint32_t)u8Old)
                         , [uNew] "r" ((uint32_t)u8New)
                         , "[fXchg]" (0)
                           RTASM_ARM_DMB_SY_COMMA_IN_REG
                         : "cc");
#  endif
    return fXchg.f;

# else
#  error "Port me"
# endif
}
#endif
1324
1325
1326/**
1327 * Atomically Compare and Exchange a signed 8-bit value, ordered.
1328 *
1329 * @returns true if xchg was done.
1330 * @returns false if xchg wasn't done.
1331 *
1332 * @param pi8 Pointer to the value to update.
1333 * @param i8New The new value to assigned to *pi8.
1334 * @param i8Old The old value to *pi8 compare with.
1335 *
1336 * @remarks x86: Requires a 486 or later.
1337 * @todo Rename ASMAtomicCmpWriteS8
1338 */
1339DECLINLINE(bool) ASMAtomicCmpXchgS8(volatile int8_t RT_FAR *pi8, const int8_t i8New, const int8_t i8Old) RT_NOTHROW_DEF
1340{
1341 return ASMAtomicCmpXchgU8((volatile uint8_t RT_FAR *)pi8, (uint8_t)i8New, (uint8_t)i8Old);
1342}
1343
1344
1345/**
1346 * Atomically Compare and Exchange a bool value, ordered.
1347 *
1348 * @returns true if xchg was done.
1349 * @returns false if xchg wasn't done.
1350 *
1351 * @param pf Pointer to the value to update.
1352 * @param fNew The new value to assigned to *pf.
1353 * @param fOld The old value to *pf compare with.
1354 *
1355 * @remarks x86: Requires a 486 or later.
1356 * @todo Rename ASMAtomicCmpWriteBool
1357 */
1358DECLINLINE(bool) ASMAtomicCmpXchgBool(volatile bool RT_FAR *pf, const bool fNew, const bool fOld) RT_NOTHROW_DEF
1359{
1360 return ASMAtomicCmpXchgU8((volatile uint8_t RT_FAR *)pf, (uint8_t)fNew, (uint8_t)fOld);
1361}
1362
1363
/**
 * Atomically Compare and Exchange an unsigned 32-bit value, ordered.
 *
 * Writes @a u32New to @a *pu32 only if the variable currently holds
 * @a u32Old.
 *
 * Only a prototype is declared when RT_INLINE_ASM_EXTERNAL_TMP_ARM is set and
 * compiler intrinsics are not in use; otherwise the function is defined
 * inline below with one implementation per architecture / compiler flavour.
 *
 * @returns true if xchg was done.
 * @returns false if xchg wasn't done.
 *
 * @param   pu32        Pointer to the value to update.
 * @param   u32New      The new value to assigned to *pu32.
 * @param   u32Old      The old value to *pu32 compare with.
 *
 * @remarks x86: Requires a 486 or later.
 * @todo Rename ASMAtomicCmpWriteU32
 */
#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
RT_ASM_DECL_PRAGMA_WATCOM(bool) ASMAtomicCmpXchgU32(volatile uint32_t RT_FAR *pu32, const uint32_t u32New, const uint32_t u32Old) RT_NOTHROW_PROTO;
#else
DECLINLINE(bool) ASMAtomicCmpXchgU32(volatile uint32_t RT_FAR *pu32, const uint32_t u32New, uint32_t u32Old) RT_NOTHROW_DEF
{
# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
#  if RT_INLINE_ASM_GNU_STYLE
    /* u32Old travels in the accumulator ("a", tied input/output, hence the
       non-const parameter); setz turns the outcome of the compare into the
       0/1 byte that is returned. */
    uint8_t u8Ret;
    __asm__ __volatile__("lock; cmpxchgl %3, %0\n\t"
                         "setz %1\n\t"
                         : "=m" (*pu32)
                         , "=qm" (u8Ret)
                         , "=a" (u32Old)
                         : "r" (u32New)
                         , "2" (u32Old)
                         , "m" (*pu32)
                         : "cc");
    return (bool)u8Ret;

#  elif RT_INLINE_ASM_USES_INTRIN
    return (uint32_t)_InterlockedCompareExchange((long RT_FAR *)pu32, u32New, u32Old) == u32Old;

#  else
    /* MSC-style inline assembly; the pointer register width follows the
       architecture (rdx vs. edx). */
    uint32_t u32Ret;
    __asm
    {
#   ifdef RT_ARCH_AMD64
        mov     rdx, [pu32]
#   else
        mov     edx, [pu32]
#   endif
        mov     eax, [u32Old]
        mov     ecx, [u32New]
#   ifdef RT_ARCH_AMD64
        lock cmpxchg [rdx], ecx
#   else
        lock cmpxchg [edx], ecx
#   endif
        setz    al
        movzx   eax, al
        mov     [u32Ret], eax
    }
    return !!u32Ret;
#  endif

# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
#  if RT_INLINE_ASM_USES_INTRIN
#   if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
    uint32_t const uOldActual = __casal32(pu32, u32Old, u32New);
#   else
    __dmb(_ARM64_BARRIER_SY);
    uint32_t const uOldActual = __cas32(pu32, u32Old, u32New);
#   endif
    return uOldActual == u32Old; /* Lets hope the compiler is clever enough to replicate our cmp + cset optimization below. */

#  else
    /* The asm produces the result as a 32-bit 0/1 value in fXchg.u, which is
       then read back through the bool member of the union. */
    union { uint32_t u; bool f; } fXchg;
    uint32_t u32Spill;
    /* M1 bench: match:    casal= 6592 vs dmb+cas= 1562 vs non-lse=5634 (ps/call)
                 mismatch: casal=18794 vs dmb+cas=19697 vs non-lse=2499 (ps/call) */
#   if defined(RTASM_ARM64_USE_FEAT_LSE)
    /* Single compare-and-swap instruction; the value it hands back
       (uOldActual, preloaded with u32Old) is compared against the expected
       value to derive the result. */
    __asm__ __volatile__("Lstart_ASMAtomicCmpXchgU32_%=:\n\t"
#    if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
                         "casal %w[uOldActual], %w[uNew], %[pMem]\n\t"
#    else
                         RTASM_ARM_DMB_SY
                         "cas %w[uOldActual], %w[uNew], %[pMem]\n\t"
#    endif
                         "cmp %w[uOldActual], %w[uOldOrg]\n\t"
                         "cset %w[fXchg], eq\n\t"
                         : [pMem] "+Q" (*pu32)
                         , [uOldActual] "=&r" (u32Spill)
                         , [fXchg] "=&r" (fXchg.u)
                         : [uNew] "r" (u32New)
                         , [uOldOrg] "r" (u32Old)
                         , "[uOldActual]" (u32Old)
                         : "cc");
#   else
    /* Exclusive load, compare, conditional exclusive store.  fXchg starts out
       as 0 (tied input) and is only set to 1 after a successful store; a
       failed store (rc != 0) restarts the sequence from the label. */
    uint32_t rcSpill;
    __asm__ __volatile__("Ltry_again_ASMAtomicCmpXchgU32_%=:\n\t"
                         RTASM_ARM_DMB_SY
#    if defined(RT_ARCH_ARM64)
                         "ldaxr %w[uOld], %[pMem]\n\t"
                         "cmp %w[uOld], %w[uCmp]\n\t"
                         "bne 1f\n\t" /* stop here if not equal */
                         "stlxr %w[rc], %w[uNew], %[pMem]\n\t"
                         "cbnz %w[rc], Ltry_again_ASMAtomicCmpXchgU32_%=\n\t"
                         "mov %w[fXchg], #1\n\t"
                         "1:\n\t"
                         "clrex\n\t"
#    else
                         "ldrex %[uOld], %[pMem]\n\t"
                         "teq %[uOld], %[uCmp]\n\t"
                         "strexeq %[rc], %[uNew], %[pMem]\n\t"
                         "bne 1f\n\t" /* stop here if not equal */
                         "cmp %[rc], #0\n\t"
                         "bne Ltry_again_ASMAtomicCmpXchgU32_%=\n\t"
                         "mov %[fXchg], #1\n\t"
                         "1:\n\t"
                         /** @todo clrexne on armv7? */
#    endif
                         : [pMem] "+Q" (*pu32)
                         , [uOld] "=&r" (u32Spill)
                         , [rc] "=&r" (rcSpill)
                         , [fXchg] "=&r" (fXchg.u)
                         : [uCmp] "r" (u32Old)
                         , [uNew] "r" (u32New)
                         , "[fXchg]" (0)
                           RTASM_ARM_DMB_SY_COMMA_IN_REG
                         : "cc");
#   endif
    return fXchg.f;
#  endif

# else
#  error "Port me"
# endif
}
#endif
1496
1497
1498/**
1499 * Atomically Compare and Exchange a signed 32-bit value, ordered.
1500 *
1501 * @returns true if xchg was done.
1502 * @returns false if xchg wasn't done.
1503 *
1504 * @param pi32 Pointer to the value to update.
1505 * @param i32New The new value to assigned to *pi32.
1506 * @param i32Old The old value to *pi32 compare with.
1507 *
1508 * @remarks x86: Requires a 486 or later.
1509 * @todo Rename ASMAtomicCmpWriteS32
1510 */
1511DECLINLINE(bool) ASMAtomicCmpXchgS32(volatile int32_t RT_FAR *pi32, const int32_t i32New, const int32_t i32Old) RT_NOTHROW_DEF
1512{
1513 return ASMAtomicCmpXchgU32((volatile uint32_t RT_FAR *)pi32, (uint32_t)i32New, (uint32_t)i32Old);
1514}
1515
1516
1517/**
1518 * Atomically Compare and exchange an unsigned 64-bit value, ordered.
1519 *
1520 * @returns true if xchg was done.
1521 * @returns false if xchg wasn't done.
1522 *
1523 * @param pu64 Pointer to the 64-bit variable to update.
1524 * @param u64New The 64-bit value to assign to *pu64.
1525 * @param u64Old The value to compare with.
1526 *
1527 * @remarks x86: Requires a Pentium or later.
1528 * @todo Rename ASMAtomicCmpWriteU64
1529 */
1530#if (RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN) \
1531 || RT_INLINE_DONT_MIX_CMPXCHG8B_AND_PIC
1532RT_ASM_DECL_PRAGMA_WATCOM(bool) ASMAtomicCmpXchgU64(volatile uint64_t RT_FAR *pu64, const uint64_t u64New, const uint64_t u64Old) RT_NOTHROW_PROTO;
1533#else
1534DECLINLINE(bool) ASMAtomicCmpXchgU64(volatile uint64_t RT_FAR *pu64, uint64_t u64New, uint64_t u64Old) RT_NOTHROW_DEF
1535{
1536# if RT_INLINE_ASM_USES_INTRIN
1537 return (uint64_t)_InterlockedCompareExchange64((__int64 RT_FAR *)pu64, u64New, u64Old) == u64Old;
1538
1539# elif defined(RT_ARCH_AMD64)
1540# if RT_INLINE_ASM_GNU_STYLE
1541 uint8_t u8Ret;
1542 __asm__ __volatile__("lock; cmpxchgq %3, %0\n\t"
1543 "setz %1\n\t"
1544 : "=m" (*pu64)
1545 , "=qm" (u8Ret)
1546 , "=a" (u64Old)
1547 : "r" (u64New)
1548 , "2" (u64Old)
1549 , "m" (*pu64)
1550 : "cc");
1551 return (bool)u8Ret;
1552# else
1553 bool fRet;
1554 __asm
1555 {
1556 mov rdx, [pu32]
1557 mov rax, [u64Old]
1558 mov rcx, [u64New]
1559 lock cmpxchg [rdx], rcx
1560 setz al
1561 mov [fRet], al
1562 }
1563 return fRet;
1564# endif
1565
1566# elif defined(RT_ARCH_X86)
1567 uint32_t u32Ret;
1568# if RT_INLINE_ASM_GNU_STYLE
1569# if defined(PIC) || defined(__PIC__)
1570 uint32_t u32EBX = (uint32_t)u64New;
1571 uint32_t u32Spill;
1572 __asm__ __volatile__("xchgl %%ebx, %4\n\t"
1573 "lock; cmpxchg8b (%6)\n\t"
1574 "setz %%al\n\t"
1575 "movl %4, %%ebx\n\t"
1576 "movzbl %%al, %%eax\n\t"
1577 : "=a" (u32Ret)
1578 , "=d" (u32Spill)
1579# if RT_GNUC_PREREQ(4, 3)
1580 , "+m" (*pu64)
1581# else
1582 , "=m" (*pu64)
1583# endif
1584 : "A" (u64Old)
1585 , "m" ( u32EBX )
1586 , "c" ( (uint32_t)(u64New >> 32) )
1587 , "S" (pu64)
1588 : "cc");
1589# else /* !PIC */
1590 uint32_t u32Spill;
1591 __asm__ __volatile__("lock; cmpxchg8b %2\n\t"
1592 "setz %%al\n\t"
1593 "movzbl %%al, %%eax\n\t"
1594 : "=a" (u32Ret)
1595 , "=d" (u32Spill)
1596 , "+m" (*pu64)
1597 : "A" (u64Old)
1598 , "b" ( (uint32_t)u64New )
1599 , "c" ( (uint32_t)(u64New >> 32) )
1600 : "cc");
1601# endif
1602 return (bool)u32Ret;
1603# else
1604 __asm
1605 {
1606 mov ebx, dword ptr [u64New]
1607 mov ecx, dword ptr [u64New + 4]
1608 mov edi, [pu64]
1609 mov eax, dword ptr [u64Old]
1610 mov edx, dword ptr [u64Old + 4]
1611 lock cmpxchg8b [edi]
1612 setz al
1613 movzx eax, al
1614 mov dword ptr [u32Ret], eax
1615 }
1616 return !!u32Ret;
1617# endif
1618
1619# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
1620# if RT_INLINE_ASM_USES_INTRIN
1621# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
1622 uint64_t const uOldActual = __casal64(pu64, u64Old, u64New);
1623# else
1624 __dmb(_ARM64_BARRIER_SY);
1625 uint64_t const uOldActual = __cas64(pu64, u64Old, u64New);
1626# endif
1627 return uOldActual == u64Old; /* Lets hope the compiler is clever enough to replicate our cmp + cset optimization below. */
1628
1629# else
1630 union { uint32_t u; bool f; } fXchg;
1631 uint64_t u64Spill;
1632 /* M1 bench: match: casal= 6599 vs dmb+cas= 1565 vs non-lse=5000 (ps/call)
1633 mismatch: casal=18797 vs dmb+cas=19731 vs non-lse=2512 (ps/call) */
1634# if defined(RTASM_ARM64_USE_FEAT_LSE)
1635 __asm__ __volatile__("Lstart_ASMAtomicCmpXchgU75_%=:\n\t"
1636# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
1637 "casal %[uOldActual], %[uNew], %[pMem]\n\t"
1638# else
1639 RTASM_ARM_DMB_SY
1640 "cas %[uOldActual], %[uNew], %[pMem]\n\t"
1641# endif
1642 "cmp %[uOldActual], %[uOldOrg]\n\t"
1643 "cset %w[fXchg], eq\n\t"
1644 : [pMem] "+Q" (*pu64)
1645 , [uOldActual] "=&r" (u64Spill)
1646 , [fXchg] "=&r" (fXchg.u)
1647 : [uNew] "r" (u64New)
1648 , [uOldOrg] "r" (u64Old)
1649 , "[uOldActual]" (u64Old)
1650 : "cc");
1651# else
1652 uint32_t rcSpill;
1653 __asm__ __volatile__("Ltry_again_ASMAtomicCmpXchgU64_%=:\n\t"
1654 RTASM_ARM_DMB_SY
1655# if defined(RT_ARCH_ARM64)
1656 "ldaxr %[uOld], %[pMem]\n\t"
1657 "cmp %[uOld], %[uCmp]\n\t"
1658 "bne 1f\n\t" /* stop here if not equal */
1659 "stlxr %w[rc], %[uNew], %[pMem]\n\t"
1660 "cbnz %w[rc], Ltry_again_ASMAtomicCmpXchgU64_%=\n\t"
1661 "mov %w[fXchg], #1\n\t"
1662 "1:\n\t"
1663 "clrex\n\t"
1664# else
1665 "ldrexd %[uOld], %H[uOld], %[pMem]\n\t"
1666 "teq %[uOld], %[uCmp]\n\t"
1667 "teqeq %H[uOld], %H[uCmp]\n\t"
1668 "strexdeq %[rc], %[uNew], %H[uNew], %[pMem]\n\t"
1669 "bne 1f\n\t" /* stop here if not equal */
1670 "cmp %[rc], #0\n\t"
1671 "bne Ltry_again_ASMAtomicCmpXchgU64_%=\n\t"
1672 "mov %[fXchg], #1\n\t"
1673 "1:\n\t"
1674 /** @todo clrexne on armv7? */
1675# endif
1676 : [pMem] "+Q" (*pu64)
1677 , [uOld] "=&r" (u64Spill)
1678 , [rc] "=&r" (rcSpill)
1679 , [fXchg] "=&r" (fXchg.u)
1680 : [uCmp] "r" (u64Old)
1681 , [uNew] "r" (u64New)
1682 , "[fXchg]" (0)
1683 RTASM_ARM_DMB_SY_COMMA_IN_REG
1684 : "cc");
1685# endif
1686 return fXchg.f;
1687# endif
1688
1689# else
1690# error "Port me"
1691# endif
1692}
1693#endif
1694
1695
1696/**
1697 * Atomically Compare and exchange a signed 64-bit value, ordered.
1698 *
1699 * @returns true if xchg was done.
1700 * @returns false if xchg wasn't done.
1701 *
1702 * @param pi64 Pointer to the 64-bit variable to update.
1703 * @param i64 The 64-bit value to assign to *pu64.
1704 * @param i64Old The value to compare with.
1705 *
1706 * @remarks x86: Requires a Pentium or later.
1707 * @todo Rename ASMAtomicCmpWriteS64
1708 */
1709DECLINLINE(bool) ASMAtomicCmpXchgS64(volatile int64_t RT_FAR *pi64, const int64_t i64, const int64_t i64Old) RT_NOTHROW_DEF
1710{
1711 return ASMAtomicCmpXchgU64((volatile uint64_t RT_FAR *)pi64, (uint64_t)i64, (uint64_t)i64Old);
1712}
1713
1714#if defined(RT_ARCH_AMD64) || defined(RT_ARCH_ARM64) || defined(DOXYGEN_RUNNING)
1715
/** @def RTASM_HAVE_CMP_WRITE_U128
 * Indicates that we've got ASMAtomicCmpWriteU128(), ASMAtomicCmpWriteU128v2()
 * and ASMAtomicCmpWriteExU128() available.
 *
 * Defined to 1 when present, so callers can test it with \#ifdef / \#if
 * before using any of the 128-bit compare-and-write functions. */
# define RTASM_HAVE_CMP_WRITE_U128 1
1720
1721
/**
 * Atomically compare and write an unsigned 128-bit value, ordered.
 *
 * Variant taking the new and the expected value as separate 64-bit halves.
 * Only a prototype is declared when RT_INLINE_ASM_EXTERNAL_TMP_ARM is set and
 * compiler intrinsics are not in use; otherwise it is defined inline below.
 *
 * @returns true if write was done.
 * @returns false if write wasn't done.
 *
 * @param   pu128       Pointer to the 128-bit variable to update.
 * @param   u64NewHi    The high 64 bits of the value to assign to *pu128.
 * @param   u64NewLo    The low 64 bits of the value to assign to *pu128.
 * @param   u64OldHi    The high 64-bit of the value to compare with.
 * @param   u64OldLo    The low 64-bit of the value to compare with.
 *
 * @remarks AMD64: Not present in the earliest CPUs, so check CPUID.
 */
# if (RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN)
DECLASM(bool) ASMAtomicCmpWriteU128v2(volatile uint128_t *pu128, const uint64_t u64NewHi, const uint64_t u64NewLo,
                                      const uint64_t u64OldHi, const uint64_t u64OldLo) RT_NOTHROW_PROTO;
# else
DECLINLINE(bool) ASMAtomicCmpWriteU128v2(volatile uint128_t *pu128, const uint64_t u64NewHi, const uint64_t u64NewLo,
                                         const uint64_t u64OldHi, const uint64_t u64OldLo) RT_NOTHROW_DEF
{
#  if RT_INLINE_ASM_USES_INTRIN
    /* The intrinsic takes the comparand as a two element array, low half
       first; its non-zero return value is mapped to true. */
    __int64 ai64Cmp[2];
    ai64Cmp[0] = u64OldLo;
    ai64Cmp[1] = u64OldHi;
    return _InterlockedCompareExchange128((__int64 volatile *)pu128, u64NewHi, u64NewLo, ai64Cmp) != 0;

#  elif (defined(__clang_major__) || defined(__GNUC__)) && defined(RT_ARCH_ARM64)
    /* Reassemble the halves into 128-bit values and let the compiler builtin
       do the work. */
    return __sync_bool_compare_and_swap(pu128, ((uint128_t)u64OldHi << 64) | u64OldLo, ((uint128_t)u64NewHi << 64) | u64NewLo);

#  elif defined(RT_ARCH_AMD64)
#   if RT_INLINE_ASM_GNU_STYLE
    /* cmpxchg16b: expected value in rdx:rax, new value in rcx:rbx.  The
       outcome is captured with setz and zero extended into the return
       register; rdx is declared as a scratch output. */
    uint64_t u64Ret;
    uint64_t u64Spill;
    __asm__ __volatile__("lock; cmpxchg16b %2\n\t"
                         "setz %%al\n\t"
                         "movzbl %%al, %%eax\n\t"
                         : "=a" (u64Ret)
                         , "=d" (u64Spill)
                         , "+m" (*pu128)
                         : "a" (u64OldLo)
                         , "d" (u64OldHi)
                         , "b" (u64NewLo)
                         , "c" (u64NewHi)
                         : "cc");

    return (bool)u64Ret;
#   else
#    error "Port me"
#   endif
#  else
#   error "Port me"
#  endif
}
# endif
1777
1778
1779/**
1780 * Atomically compare and write an unsigned 128-bit value, ordered.
1781 *
1782 * @returns true if write was done.
1783 * @returns false if write wasn't done.
1784 *
1785 * @param pu128 Pointer to the 128-bit variable to update.
1786 * @param u128New The 128-bit value to assign to *pu128.
1787 * @param u128Old The value to compare with.
1788 *
1789 * @remarks AMD64: Not present in the earliest CPUs, so check CPUID.
1790 */
1791DECLINLINE(bool) ASMAtomicCmpWriteU128(volatile uint128_t *pu128, const uint128_t u128New, const uint128_t u128Old) RT_NOTHROW_DEF
1792{
1793# ifdef RT_COMPILER_WITH_128BIT_INT_TYPES
1794# if (defined(__clang_major__) || defined(__GNUC__)) && defined(RT_ARCH_ARM64)
1795 return __sync_bool_compare_and_swap(pu128, u128Old, u128New);
1796# else
1797 return ASMAtomicCmpWriteU128v2(pu128, (uint64_t)(u128New >> 64), (uint64_t)u128New,
1798 (uint64_t)(u128Old >> 64), (uint64_t)u128Old);
1799# endif
1800# else
1801 return ASMAtomicCmpWriteU128v2(pu128, u128New.Hi, u128New.Lo, u128Old.Hi, u128Old.Lo);
1802# endif
1803}
1804
1805
1806/**
1807 * RTUINT128U wrapper for ASMAtomicCmpWriteU128.
1808 */
1809DECLINLINE(bool) ASMAtomicCmpWriteU128U(volatile RTUINT128U *pu128, const RTUINT128U u128New,
1810 const RTUINT128U u128Old) RT_NOTHROW_DEF
1811{
1812# if (defined(__clang_major__) || defined(__GNUC__)) && defined(RT_ARCH_ARM64)
1813 return ASMAtomicCmpWriteU128(&pu128->u, u128New.u, u128Old.u);
1814# else
1815 return ASMAtomicCmpWriteU128v2(&pu128->u, u128New.s.Hi, u128New.s.Lo, u128Old.s.Hi, u128Old.s.Lo);
1816# endif
1817}
1818
1819#endif /* RT_ARCH_AMD64 || RT_ARCH_ARM64 */
1820
1821/**
1822 * Atomically Compare and Exchange a pointer value, ordered.
1823 *
1824 * @returns true if xchg was done.
1825 * @returns false if xchg wasn't done.
1826 *
1827 * @param ppv Pointer to the value to update.
1828 * @param pvNew The new value to assigned to *ppv.
1829 * @param pvOld The old value to *ppv compare with.
1830 *
1831 * @remarks x86: Requires a 486 or later.
1832 * @todo Rename ASMAtomicCmpWritePtrVoid
1833 */
1834DECLINLINE(bool) ASMAtomicCmpXchgPtrVoid(void RT_FAR * volatile RT_FAR *ppv, const void RT_FAR *pvNew, const void RT_FAR *pvOld) RT_NOTHROW_DEF
1835{
1836#if ARCH_BITS == 32 || ARCH_BITS == 16
1837 return ASMAtomicCmpXchgU32((volatile uint32_t RT_FAR *)(void RT_FAR *)ppv, (uint32_t)pvNew, (uint32_t)pvOld);
1838#elif ARCH_BITS == 64
1839 return ASMAtomicCmpXchgU64((volatile uint64_t RT_FAR *)(void RT_FAR *)ppv, (uint64_t)pvNew, (uint64_t)pvOld);
1840#else
1841# error "ARCH_BITS is bogus"
1842#endif
1843}
1844
1845
/**
 * Atomically Compare and Exchange a pointer value, ordered.
 *
 * Wrapper around ASMAtomicCmpXchgPtrVoid() that spares the caller the casts.
 * With GCC compatible compilers all three arguments are first assigned to
 * temporaries typed after *(ppv), so mismatching pointer types are subject to
 * the compiler's normal assignment checks; other compilers get a plain
 * cast-everything variant.
 *
 * @returns true if xchg was done.
 * @returns false if xchg wasn't done.
 *
 * @param   ppv         Pointer to the value to update.
 * @param   pvNew       The new value to assigned to *ppv.
 * @param   pvOld       The old value to *ppv compare with.
 *
 * @remarks This is relatively type safe on GCC platforms.
 * @remarks x86: Requires a 486 or later.
 * @todo Rename ASMAtomicCmpWritePtr
 */
#ifdef __GNUC__
# define ASMAtomicCmpXchgPtr(ppv, pvNew, pvOld) \
    __extension__ \
    ({\
        __typeof__(*(ppv)) volatile * const ppvTypeChecked   = (ppv); \
        __typeof__(*(ppv))            const pvNewTypeChecked = (pvNew); \
        __typeof__(*(ppv))            const pvOldTypeChecked = (pvOld); \
        bool fMacroRet = ASMAtomicCmpXchgPtrVoid((void * volatile *)ppvTypeChecked, \
                                                 (void *)pvNewTypeChecked, (void *)pvOldTypeChecked); \
        fMacroRet; \
     })
#else
# define ASMAtomicCmpXchgPtr(ppv, pvNew, pvOld) \
    ASMAtomicCmpXchgPtrVoid((void RT_FAR * volatile RT_FAR *)(ppv), (void RT_FAR *)(pvNew), (void RT_FAR *)(pvOld))
#endif
1875
1876
/** @def ASMAtomicCmpXchgHandle
 * Atomically Compare and Exchange a typical IPRT handle value, ordered.
 *
 * The handle is handled as a 32-bit or 64-bit integer, selected by
 * HC_ARCH_BITS (ARCH_BITS == 16 also takes the 32-bit variant).  *ph is
 * compile-time checked to have exactly that size.
 *
 * @param   ph          Pointer to the value to update.
 * @param   hNew        The new value to assign to *ph.
 * @param   hOld        The old value to compare *ph with.
 * @param   fRc         Where to store the result.
 *
 * @remarks This doesn't currently work for all handles (like RTFILE).
 * @remarks x86: Requires a 486 or later.
 * @todo Rename ASMAtomicCmpWriteHandle
 */
#if HC_ARCH_BITS == 32 || ARCH_BITS == 16
# define ASMAtomicCmpXchgHandle(ph, hNew, hOld, fRc) \
   do { \
       AssertCompile(sizeof(*(ph)) == sizeof(uint32_t)); \
       (fRc) = ASMAtomicCmpXchgU32((uint32_t volatile RT_FAR *)(ph), (const uint32_t)(hNew), (const uint32_t)(hOld)); \
   } while (0)
#elif HC_ARCH_BITS == 64
# define ASMAtomicCmpXchgHandle(ph, hNew, hOld, fRc) \
   do { \
       AssertCompile(sizeof(*(ph)) == sizeof(uint64_t)); \
       (fRc) = ASMAtomicCmpXchgU64((uint64_t volatile RT_FAR *)(ph), (const uint64_t)(hNew), (const uint64_t)(hOld)); \
   } while (0)
#else
# error HC_ARCH_BITS
#endif
1904
1905
/** @def ASMAtomicCmpXchgSize
 * Atomically Compare and Exchange a value which size might differ
 * between platforms or compilers, ordered.
 *
 * Dispatches on sizeof(*pu): 4 byte values go to ASMAtomicCmpXchgU32 and
 * 8 byte values to ASMAtomicCmpXchgU64.  Any other size triggers an assertion
 * and sets @a fRc to false.
 *
 * @param   pu          Pointer to the value to update.
 * @param   uNew        The new value to assign to *pu.
 * @param   uOld        The value to compare *pu with.
 * @param   fRc         Where to store the result.
 *
 * @remarks x86: Requires a 486 or later.
 * @todo    Rename ASMAtomicCmpWriteSize
 */
#define ASMAtomicCmpXchgSize(pu, uNew, uOld, fRc) \
    do { \
        switch (sizeof(*(pu))) { \
            case 4: (fRc) = ASMAtomicCmpXchgU32((volatile uint32_t RT_FAR *)(void RT_FAR *)(pu), (uint32_t)(uNew), (uint32_t)(uOld)); \
                break; \
            case 8: (fRc) = ASMAtomicCmpXchgU64((volatile uint64_t RT_FAR *)(void RT_FAR *)(pu), (uint64_t)(uNew), (uint64_t)(uOld)); \
                break; \
            /* sizeof yields size_t; cast it so the argument matches the %d conversion. */ \
            default: AssertMsgFailed(("ASMAtomicCmpXchgSize: size %d is not supported\n", (int)sizeof(*(pu)))); \
                (fRc) = false; \
                break; \
        } \
    } while (0)
1930
1931
1932/**
1933 * Atomically Compare and Exchange an unsigned 8-bit value, additionally passes
1934 * back old value, ordered.
1935 *
1936 * @returns true if xchg was done.
1937 * @returns false if xchg wasn't done.
1938 *
1939 * @param pu8 Pointer to the value to update.
1940 * @param u8New The new value to assigned to *pu32.
1941 * @param u8Old The old value to *pu8 compare with.
1942 * @param pu8Old Pointer store the old value at.
1943 *
1944 * @remarks x86: Requires a 486 or later.
1945 */
1946#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
1947RT_ASM_DECL_PRAGMA_WATCOM(bool) ASMAtomicCmpXchgExU8(volatile uint8_t RT_FAR *pu8, const uint8_t u8New, const uint8_t u8Old, uint8_t RT_FAR *pu8Old) RT_NOTHROW_PROTO;
1948#else
1949DECLINLINE(bool) ASMAtomicCmpXchgExU8(volatile uint8_t RT_FAR *pu8, const uint8_t u8New, const uint8_t u8Old, uint8_t RT_FAR *pu8Old) RT_NOTHROW_DEF
1950{
1951# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
1952# if RT_INLINE_ASM_GNU_STYLE
1953 uint8_t u8Ret;
1954 __asm__ __volatile__("lock; cmpxchgb %3, %0\n\t"
1955 "setz %1\n\t"
1956 : "=m" (*pu8)
1957 , "=qm" (u8Ret)
1958 , "=a" (*pu8Old)
1959# if defined(RT_ARCH_X86)
1960 : "q" (u8New)
1961# else
1962 : "r" (u8New)
1963# endif
1964 , "a" (u8Old)
1965 , "m" (*pu8)
1966 : "cc");
1967 return (bool)u8Ret;
1968
1969# elif RT_INLINE_ASM_USES_INTRIN
1970 return (*pu8Old = _InterlockedCompareExchange8((char RT_FAR *)pu8, u8New, u8Old)) == u8Old;
1971
1972# else
1973 uint8_t u8Ret;
1974 __asm
1975 {
1976# ifdef RT_ARCH_AMD64
1977 mov rdx, [pu8]
1978# else
1979 mov edx, [pu8]
1980# endif
1981 mov eax, [u8Old]
1982 mov ecx, [u8New]
1983# ifdef RT_ARCH_AMD64
1984 lock cmpxchg [rdx], ecx
1985 mov rdx, [pu8Old]
1986 mov [rdx], eax
1987# else
1988 lock cmpxchg [edx], ecx
1989 mov edx, [pu8Old]
1990 mov [edx], eax
1991# endif
1992 setz al
1993 movzx eax, al
1994 mov [u8Ret], eax
1995 }
1996 return !!u8Ret;
1997# endif
1998
1999# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
2000# if RT_INLINE_ASM_USES_INTRIN
2001# if defined(RTASM_ARM64_USE_FEAT_LSE)
2002# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
2003 uint8_t const uOldActual = __casal8(pu8, u8Old, u8New);
2004# else
2005 __dmb(_ARM64_BARRIER_SY);
2006 uint8_t const uOldActual = __cas8(pu8, u8Old, u8New);
2007# endif
2008# else
2009 uint8_t const uOldActual = _InterlockedCompareExchange8((char RT_FAR *)pu8, u8New, u8Old);
2010# endif
2011 *pu8Old = uOldActual;
2012 return uOldActual == u8Old; /* Lets hope the compiler is clever enough to replicate our cmp + cset optimization below. */
2013
2014# else
2015 /* M1 bench: match: casalb= 6594 vs dmb+casb= 1561 vs non-lse=5051 (ps/call)
2016 mismatch: casalb=15346 vs dmb+casb=16349 vs non-lse=2505 (ps/call) */
2017# if defined(RTASM_ARM64_USE_FEAT_LSE)
2018 union { uint32_t u; bool f; } fXchg;
2019 uint32_t u32Actual;
2020 __asm__ __volatile__("Lstart_ASMAtomicCmpXchgExU8_%=:\n\t"
2021# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
2022 "casalb %w[uOldActual], %w[uNew], %[pMem]\n\t"
2023# else
2024 RTASM_ARM_DMB_SY
2025 "casb %w[uOldActual], %w[uNew], %[pMem]\n\t"
2026# endif
2027 "cmp %w[uOldActual], %w[uOldOrg]\n\t"
2028 "cset %w[fXchg], eq\n\t"
2029 : [pMem] "+Q" (*pu8)
2030 , [uOldActual] "=&r" (u32Actual)
2031 , [fXchg] "=&r" (fXchg.u)
2032 : [uNew] "r" ((uint32_t)u8New)
2033 , [uOldOrg] "r" ((uint32_t)u8Old)
2034 , "[uOldActual]" ((uint32_t)u8Old)
2035 : "cc");
2036 *pu8Old = (uint8_t)u32Actual;
2037# else
2038 union { uint8_t u; bool f; } fXchg;
2039 uint8_t u8ActualOld;
2040 uint8_t rcSpill;
2041 __asm__ __volatile__("Ltry_again_ASMAtomicCmpXchgExU8_%=:\n\t"
2042 RTASM_ARM_DMB_SY
2043# if defined(RT_ARCH_ARM64)
2044 "ldaxrb %w[uOld], %[pMem]\n\t"
2045 "cmp %w[uOld], %w[uCmp]\n\t"
2046 "bne 1f\n\t" /* stop here if not equal */
2047 "stlxrb %w[rc], %w[uNew], %[pMem]\n\t"
2048 "cbnz %w[rc], Ltry_again_ASMAtomicCmpXchgExU8_%=\n\t"
2049 "mov %w[fXchg], #1\n\t"
2050 "1:\n\t"
2051 "clrex\n\t"
2052# else
2053 "ldrexb %[uOld], %[pMem]\n\t"
2054 "teq %[uOld], %[uCmp]\n\t"
2055 "strexbeq %[rc], %[uNew], %[pMem]\n\t"
2056 "bne 1f\n\t" /* stop here if not equal */
2057 "cmp %[rc], #0\n\t"
2058 "bne Ltry_again_ASMAtomicCmpXchgExU8_%=\n\t"
2059 "mov %[fXchg], #1\n\t"
2060 "1:\n\t"
2061 /** @todo clrexne on armv7? */
2062# endif
2063 : [pMem] "+Q" (*pu8)
2064 , [uOld] "=&r" (u8ActualOld)
2065 , [rc] "=&r" (rcSpill)
2066 , [fXchg] "=&r" (fXchg.u)
2067 : [uCmp] "r" (u8Old)
2068 , [uNew] "r" (u8New)
2069 , "[fXchg]" (0)
2070 RTASM_ARM_DMB_SY_COMMA_IN_REG
2071 : "cc");
2072 *pu8Old = u8ActualOld;
2073# endif
2074 return fXchg.f;
2075# endif
2076
2077# else
2078# error "Port me"
2079# endif
2080}
2081#endif
2082
2083
2084/**
2085 * Atomically Compare and Exchange a signed 8-bit value, additionally
2086 * passes back old value, ordered.
2087 *
2088 * @returns true if xchg was done.
2089 * @returns false if xchg wasn't done.
2090 *
2091 * @param pi8 Pointer to the value to update.
2092 * @param i8New The new value to assigned to *pi8.
2093 * @param i8Old The old value to *pi8 compare with.
2094 * @param pi8Old Pointer store the old value at.
2095 *
2096 * @remarks x86: Requires a 486 or later.
2097 */
2098DECLINLINE(bool) ASMAtomicCmpXchgExS8(volatile int8_t RT_FAR *pi8, const int8_t i8New, const int8_t i8Old, int8_t RT_FAR *pi8Old) RT_NOTHROW_DEF
2099{
2100 return ASMAtomicCmpXchgExU8((volatile uint8_t RT_FAR *)pi8, (uint8_t)i8New, (uint8_t)i8Old, (uint8_t RT_FAR *)pi8Old);
2101}
2102
2103
2104/**
2105 * Atomically Compare and Exchange an unsigned 16-bit value, additionally passes
2106 * back old value, ordered.
2107 *
2108 * @returns true if xchg was done.
2109 * @returns false if xchg wasn't done.
2110 *
2111 * @param pu16 Pointer to the value to update.
2112 * @param u16New The new value to assigned to *pu16.
2113 * @param u16Old The old value to *pu32 compare with.
2114 * @param pu16Old Pointer store the old value at.
2115 *
2116 * @remarks x86: Requires a 486 or later.
2117 */
2118#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
2119RT_ASM_DECL_PRAGMA_WATCOM(bool) ASMAtomicCmpXchgExU16(volatile uint16_t RT_FAR *pu16, const uint16_t u16New, const uint16_t u16Old, uint16_t RT_FAR *pu16Old) RT_NOTHROW_PROTO;
2120#else
2121DECLINLINE(bool) ASMAtomicCmpXchgExU16(volatile uint16_t RT_FAR *pu16, const uint16_t u16New, const uint16_t u16Old, uint16_t RT_FAR *pu16Old) RT_NOTHROW_DEF
2122{
2123# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
2124# if RT_INLINE_ASM_GNU_STYLE
2125 uint8_t u8Ret;
2126 __asm__ __volatile__("lock; cmpxchgw %3, %0\n\t"
2127 "setz %1\n\t"
2128 : "=m" (*pu16)
2129 , "=qm" (u8Ret)
2130 , "=a" (*pu16Old)
2131 : "r" (u16New)
2132 , "a" (u16Old)
2133 , "m" (*pu16)
2134 : "cc");
2135 return (bool)u8Ret;
2136
2137# elif RT_INLINE_ASM_USES_INTRIN
2138 return (*pu16Old = _InterlockedCompareExchange16((short RT_FAR *)pu16, u16New, u16Old)) == u16Old;
2139
2140# else
2141 uint16_t u16Ret;
2142 __asm
2143 {
2144# ifdef RT_ARCH_AMD64
2145 mov rdx, [pu16]
2146# else
2147 mov edx, [pu16]
2148# endif
2149 mov eax, [u16Old]
2150 mov ecx, [u16New]
2151# ifdef RT_ARCH_AMD64
2152 lock cmpxchg [rdx], ecx
2153 mov rdx, [pu16Old]
2154 mov [rdx], eax
2155# else
2156 lock cmpxchg [edx], ecx
2157 mov edx, [pu16Old]
2158 mov [edx], eax
2159# endif
2160 setz al
2161 movzx eax, al
2162 mov [u16Ret], eax
2163 }
2164 return !!u16Ret;
2165# endif
2166
2167# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
2168# if RT_INLINE_ASM_USES_INTRIN
2169# if defined(RTASM_ARM64_USE_FEAT_LSE)
2170# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
2171 uint16_t const uOldActual = __casal16(pu16, u16Old, u16New);
2172# else
2173 __dmb(_ARM64_BARRIER_SY);
2174 uint16_t const uOldActual = __cas16(pu16, u16Old, u16New);
2175# endif
2176# else
2177 uint16_t const uOldActual = _InterlockedCompareExchange16((char RT_FAR *)pu16, u16New, u16Old);
2178# endif
2179 *pu16Old = uOldActual;
2180 return uOldActual == u16Old; /* Lets hope the compiler is clever enough to replicate our cmp + cset optimization below. */
2181
2182# else
2183 /* M1 bench: match: casalh= 6577 vs dmb+cash= 1608 vs non-lse=5078 (ps/call)
2184 mismatch: casalh=18791 vs dmb+cash=19721 vs non-lse=2543 (ps/call) */
2185# if defined(RTASM_ARM64_USE_FEAT_LSE)
2186 union { uint32_t u; bool f; } fXchg;
2187 uint32_t u32Actual;
2188 __asm__ __volatile__("Lstart_ASMAtomicCmpXchgExU16_%=:\n\t"
2189# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
2190 "casalh %w[uOldActual], %w[uNew], %[pMem]\n\t"
2191# else
2192 RTASM_ARM_DMB_SY
2193 "cash %w[uOldActual], %w[uNew], %[pMem]\n\t"
2194# endif
2195 "cmp %w[uOldActual], %w[uOldOrg]\n\t"
2196 "cset %w[fXchg], eq\n\t"
2197 : [pMem] "+Q" (*pu16)
2198 , [uOldActual] "=&r" (u32Actual)
2199 , [fXchg] "=&r" (fXchg.u)
2200 : [uNew] "r" ((uint32_t)u16New)
2201 , [uOldOrg] "r" ((uint32_t)u16Old)
2202 , "[uOldActual]" ((uint32_t)u16Old)
2203 : "cc");
2204 *pu16Old = (uint16_t)u32Actual;
2205# else
2206 union { uint16_t u; bool f; } fXchg;
2207 uint16_t u16ActualOld;
2208 uint16_t rcSpill;
2209 __asm__ __volatile__("Ltry_again_ASMAtomicCmpXchgExU16_%=:\n\t"
2210 RTASM_ARM_DMB_SY
2211# if defined(RT_ARCH_ARM64)
2212 "ldaxrh %w[uOld], %[pMem]\n\t"
2213 "cmp %w[uOld], %w[uCmp]\n\t"
2214 "bne 1f\n\t" /* stop here if not equal */
2215 "stlxrh %w[rc], %w[uNew], %[pMem]\n\t"
2216 "cbnz %w[rc], Ltry_again_ASMAtomicCmpXchgExU16_%=\n\t"
2217 "mov %w[fXchg], #1\n\t"
2218 "1:\n\t"
2219 "clrex\n\t"
2220# else
2221 "ldrexh %[uOld], %[pMem]\n\t"
2222 "teq %[uOld], %[uCmp]\n\t"
2223 "strexheq %[rc], %[uNew], %[pMem]\n\t"
2224 "bne 1f\n\t" /* stop here if not equal */
2225 "cmp %[rc], #0\n\t"
2226 "bne Ltry_again_ASMAtomicCmpXchgExU16_%=\n\t"
2227 "mov %[fXchg], #1\n\t"
2228 "1:\n\t"
2229 /** @todo clrexne on armv7? */
2230# endif
2231 : [pMem] "+Q" (*pu16)
2232 , [uOld] "=&r" (u16ActualOld)
2233 , [rc] "=&r" (rcSpill)
2234 , [fXchg] "=&r" (fXchg.u)
2235 : [uCmp] "r" (u16Old)
2236 , [uNew] "r" (u16New)
2237 , "[fXchg]" (0)
2238 RTASM_ARM_DMB_SY_COMMA_IN_REG
2239 : "cc");
2240 *pu16Old = u16ActualOld;
2241# endif
2242 return fXchg.f;
2243# endif
2244
2245# else
2246# error "Port me"
2247# endif
2248}
2249#endif
2250
2251
2252/**
2253 * Atomically Compare and Exchange a signed 16-bit value, additionally
2254 * passes back old value, ordered.
2255 *
2256 * @returns true if xchg was done.
2257 * @returns false if xchg wasn't done.
2258 *
2259 * @param pi16 Pointer to the value to update.
2260 * @param i16New The new value to assigned to *pi16.
2261 * @param i16Old The old value to *pi16 compare with.
2262 * @param pi16Old Pointer store the old value at.
2263 *
2264 * @remarks x86: Requires a 486 or later.
2265 */
2266DECLINLINE(bool) ASMAtomicCmpXchgExS16(volatile int16_t RT_FAR *pi16, const int16_t i16New, const int16_t i16Old, int16_t RT_FAR *pi16Old) RT_NOTHROW_DEF
2267{
2268 return ASMAtomicCmpXchgExU16((volatile uint16_t RT_FAR *)pi16, (uint16_t)i16New, (uint16_t)i16Old, (uint16_t RT_FAR *)pi16Old);
2269}
2270
2271
2272/**
2273 * Atomically Compare and Exchange an unsigned 32-bit value, additionally
2274 * passes back old value, ordered.
2275 *
2276 * @returns true if xchg was done.
2277 * @returns false if xchg wasn't done.
2278 *
2279 * @param pu32 Pointer to the value to update.
2280 * @param u32New The new value to assigned to *pu32.
2281 * @param u32Old The old value to *pu32 compare with.
2282 * @param pu32Old Pointer store the old value at.
2283 *
2284 * @remarks x86: Requires a 486 or later.
2285 */
2286#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
2287RT_ASM_DECL_PRAGMA_WATCOM(bool) ASMAtomicCmpXchgExU32(volatile uint32_t RT_FAR *pu32, const uint32_t u32New, const uint32_t u32Old, uint32_t RT_FAR *pu32Old) RT_NOTHROW_PROTO;
2288#else
2289DECLINLINE(bool) ASMAtomicCmpXchgExU32(volatile uint32_t RT_FAR *pu32, const uint32_t u32New, const uint32_t u32Old, uint32_t RT_FAR *pu32Old) RT_NOTHROW_DEF
2290{
2291# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
2292# if RT_INLINE_ASM_GNU_STYLE
2293 uint8_t u8Ret;
2294 __asm__ __volatile__("lock; cmpxchgl %3, %0\n\t"
2295 "setz %1\n\t"
2296 : "=m" (*pu32)
2297 , "=qm" (u8Ret)
2298 , "=a" (*pu32Old)
2299 : "r" (u32New)
2300 , "a" (u32Old)
2301 , "m" (*pu32)
2302 : "cc");
2303 return (bool)u8Ret;
2304
2305# elif RT_INLINE_ASM_USES_INTRIN
2306 return (*pu32Old = _InterlockedCompareExchange((long RT_FAR *)pu32, u32New, u32Old)) == u32Old;
2307
2308# else
2309 uint32_t u32Ret;
2310 __asm
2311 {
2312# ifdef RT_ARCH_AMD64
2313 mov rdx, [pu32]
2314# else
2315 mov edx, [pu32]
2316# endif
2317 mov eax, [u32Old]
2318 mov ecx, [u32New]
2319# ifdef RT_ARCH_AMD64
2320 lock cmpxchg [rdx], ecx
2321 mov rdx, [pu32Old]
2322 mov [rdx], eax
2323# else
2324 lock cmpxchg [edx], ecx
2325 mov edx, [pu32Old]
2326 mov [edx], eax
2327# endif
2328 setz al
2329 movzx eax, al
2330 mov [u32Ret], eax
2331 }
2332 return !!u32Ret;
2333# endif
2334
2335# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
2336# if RT_INLINE_ASM_USES_INTRIN
2337# if defined(RTASM_ARM64_USE_FEAT_LSE)
2338# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
2339 uint32_t const uOldActual = __casal32(pu32, u32Old, u32New);
2340# else
2341 __dmb(_ARM64_BARRIER_SY);
2342 uint32_t const uOldActual = __cas32(pu32, u32Old, u32New);
2343# endif
2344# else
2345 uint32_t const uOldActual = _InterlockedCompareExchange((char RT_FAR *)pu32, u32New, u32Old);
2346# endif
2347 *pu32Old = uOldActual;
2348 return uOldActual == u32Old; /* Lets hope the compiler is clever enough to replicate our cmp + cset optimization below. */
2349
2350# else
2351
2352 union { uint32_t u; bool f; } fXchg;
2353 /* M1 bench: match: casal= 6590 vs dmb+cas= 1564 vs non-lse=5033 (ps/call)
2354 mismatch: casal=18790 vs dmb+cas=19711 vs non-lse=2503 (ps/call) */
2355# if defined(RTASM_ARM64_USE_FEAT_LSE)
2356 __asm__ __volatile__("Lstart_ASMAtomicCmpXchgExU32_%=:\n\t"
2357# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
2358 "casal %w[uOldActual], %w[uNew], %[pMem]\n\t"
2359# else
2360 RTASM_ARM_DMB_SY
2361 "cas %w[uOldActual], %w[uNew], %[pMem]\n\t"
2362# endif
2363 "cmp %w[uOldActual], %w[uOldOrg]\n\t"
2364 "cset %w[fXchg], eq\n\t"
2365 : [pMem] "+Q" (*pu32)
2366 , [uOldActual] "=&r" (*pu32Old)
2367 , [fXchg] "=&r" (fXchg.u)
2368 : [uNew] "r" (u32New)
2369 , [uOldOrg] "r" (u32Old)
2370 , "[uOldActual]" (u32Old)
2371 : "cc");
2372# else
2373 uint32_t u32ActualOld;
2374 uint32_t rcSpill;
2375 __asm__ __volatile__("Ltry_again_ASMAtomicCmpXchgExU32_%=:\n\t"
2376 RTASM_ARM_DMB_SY
2377# if defined(RT_ARCH_ARM64)
2378 "ldaxr %w[uOld], %[pMem]\n\t"
2379 "cmp %w[uOld], %w[uCmp]\n\t"
2380 "bne 1f\n\t" /* stop here if not equal */
2381 "stlxr %w[rc], %w[uNew], %[pMem]\n\t"
2382 "cbnz %w[rc], Ltry_again_ASMAtomicCmpXchgExU32_%=\n\t"
2383 "mov %w[fXchg], #1\n\t"
2384 "1:\n\t"
2385 "clrex\n\t"
2386# else
2387 "ldrex %[uOld], %[pMem]\n\t"
2388 "teq %[uOld], %[uCmp]\n\t"
2389 "strexeq %[rc], %[uNew], %[pMem]\n\t"
2390 "bne 1f\n\t" /* stop here if not equal */
2391 "cmp %[rc], #0\n\t"
2392 "bne Ltry_again_ASMAtomicCmpXchgExU32_%=\n\t"
2393 "mov %[fXchg], #1\n\t"
2394 "1:\n\t"
2395 /** @todo clrexne on armv7? */
2396# endif
2397 : [pMem] "+Q" (*pu32)
2398 , [uOld] "=&r" (u32ActualOld)
2399 , [rc] "=&r" (rcSpill)
2400 , [fXchg] "=&r" (fXchg.u)
2401 : [uCmp] "r" (u32Old)
2402 , [uNew] "r" (u32New)
2403 , "[fXchg]" (0)
2404 RTASM_ARM_DMB_SY_COMMA_IN_REG
2405 : "cc");
2406 *pu32Old = u32ActualOld;
2407# endif
2408 return fXchg.f;
2409# endif
2410
2411# else
2412# error "Port me"
2413# endif
2414}
2415#endif
2416
2417
2418/**
2419 * Atomically Compare and Exchange a signed 32-bit value, additionally
2420 * passes back old value, ordered.
2421 *
2422 * @returns true if xchg was done.
2423 * @returns false if xchg wasn't done.
2424 *
2425 * @param pi32 Pointer to the value to update.
2426 * @param i32New The new value to assigned to *pi32.
2427 * @param i32Old The old value to *pi32 compare with.
2428 * @param pi32Old Pointer store the old value at.
2429 *
2430 * @remarks x86: Requires a 486 or later.
2431 */
2432DECLINLINE(bool) ASMAtomicCmpXchgExS32(volatile int32_t RT_FAR *pi32, const int32_t i32New, const int32_t i32Old, int32_t RT_FAR *pi32Old) RT_NOTHROW_DEF
2433{
2434 return ASMAtomicCmpXchgExU32((volatile uint32_t RT_FAR *)pi32, (uint32_t)i32New, (uint32_t)i32Old, (uint32_t RT_FAR *)pi32Old);
2435}
2436
2437
2438/**
2439 * Atomically Compare and exchange an unsigned 64-bit value, additionally
2440 * passing back old value, ordered.
2441 *
2442 * @returns true if xchg was done.
2443 * @returns false if xchg wasn't done.
2444 *
2445 * @param pu64 Pointer to the 64-bit variable to update.
2446 * @param u64New The 64-bit value to assign to *pu64.
2447 * @param u64Old The value to compare with.
2448 * @param pu64Old Pointer store the old value at.
2449 *
2450 * @remarks x86: Requires a Pentium or later.
2451 */
2452#if (RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN) \
2453 || RT_INLINE_DONT_MIX_CMPXCHG8B_AND_PIC
2454RT_ASM_DECL_PRAGMA_WATCOM(bool) ASMAtomicCmpXchgExU64(volatile uint64_t RT_FAR *pu64, const uint64_t u64New, const uint64_t u64Old, uint64_t RT_FAR *pu64Old) RT_NOTHROW_PROTO;
2455#else
2456DECLINLINE(bool) ASMAtomicCmpXchgExU64(volatile uint64_t RT_FAR *pu64, const uint64_t u64New, const uint64_t u64Old, uint64_t RT_FAR *pu64Old) RT_NOTHROW_DEF
2457{
2458# if RT_INLINE_ASM_USES_INTRIN
2459 return (*pu64Old =_InterlockedCompareExchange64((__int64 RT_FAR *)pu64, u64New, u64Old)) == u64Old;
2460
2461# elif defined(RT_ARCH_AMD64)
2462# if RT_INLINE_ASM_GNU_STYLE
2463 uint8_t u8Ret;
2464 __asm__ __volatile__("lock; cmpxchgq %3, %0\n\t"
2465 "setz %1\n\t"
2466 : "=m" (*pu64)
2467 , "=qm" (u8Ret)
2468 , "=a" (*pu64Old)
2469 : "r" (u64New)
2470 , "a" (u64Old)
2471 , "m" (*pu64)
2472 : "cc");
2473 return (bool)u8Ret;
2474# else
2475 bool fRet;
2476 __asm
2477 {
2478 mov rdx, [pu32]
2479 mov rax, [u64Old]
2480 mov rcx, [u64New]
2481 lock cmpxchg [rdx], rcx
2482 mov rdx, [pu64Old]
2483 mov [rdx], rax
2484 setz al
2485 mov [fRet], al
2486 }
2487 return fRet;
2488# endif
2489
2490# elif defined(RT_ARCH_X86)
2491# if RT_INLINE_ASM_GNU_STYLE
2492 uint64_t u64Ret;
2493# if defined(PIC) || defined(__PIC__)
2494 /* Note #1: This code uses a memory clobber description, because the clean
2495 solution with an output value for *pu64 makes gcc run out of
2496 registers. This will cause suboptimal code, and anyone with a
2497 better solution is welcome to improve this.
2498
2499 Note #2: We must prevent gcc from encoding the memory access, as it
2500 may go via the GOT if we're working on a global variable (like
2501 in the testcase). Thus we request a register (%3) and
2502 dereference it ourselves. */
2503 __asm__ __volatile__("xchgl %%ebx, %1\n\t"
2504 "lock; cmpxchg8b (%3)\n\t"
2505 "xchgl %%ebx, %1\n\t"
2506 : "=A" (u64Ret)
2507 : "DS" ((uint32_t)u64New)
2508 , "c" ((uint32_t)(u64New >> 32))
2509 , "r" (pu64) /* Do not use "m" here*/
2510 , "0" (u64Old)
2511 : "memory"
2512 , "cc" );
2513# else /* !PIC */
2514 __asm__ __volatile__("lock; cmpxchg8b %4\n\t"
2515 : "=A" (u64Ret)
2516 , "=m" (*pu64)
2517 : "b" ((uint32_t)u64New)
2518 , "c" ((uint32_t)(u64New >> 32))
2519 , "m" (*pu64)
2520 , "0" (u64Old)
2521 : "cc");
2522# endif
2523 *pu64Old = u64Ret;
2524 return u64Ret == u64Old;
2525# else
2526 uint32_t u32Ret;
2527 __asm
2528 {
2529 mov ebx, dword ptr [u64New]
2530 mov ecx, dword ptr [u64New + 4]
2531 mov edi, [pu64]
2532 mov eax, dword ptr [u64Old]
2533 mov edx, dword ptr [u64Old + 4]
2534 lock cmpxchg8b [edi]
2535 mov ebx, [pu64Old]
2536 mov [ebx], eax
2537 setz al
2538 movzx eax, al
2539 add ebx, 4
2540 mov [ebx], edx
2541 mov dword ptr [u32Ret], eax
2542 }
2543 return !!u32Ret;
2544# endif
2545
2546# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
2547 union { uint32_t u; bool f; } fXchg;
2548 /* M1 bench: match: casal= 6606 vs dmb+cas= 1565 vs non-lse=5006 (ps/call)
2549 mismatch: casal=18786 vs dmb+cas=19718 vs non-lse=2503 (ps/call) */
2550# if defined(RTASM_ARM64_USE_FEAT_LSE)
2551 __asm__ __volatile__("Lstart_ASMAtomicCmpXchgExU32_%=:\n\t"
2552# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
2553 "casal %[uOldActual], %[uNew], %[pMem]\n\t"
2554# else
2555 RTASM_ARM_DMB_SY
2556 "cas %[uOldActual], %[uNew], %[pMem]\n\t"
2557# endif
2558 "cmp %[uOldActual], %[uOldOrg]\n\t"
2559 "cset %w[fXchg], eq\n\t"
2560 : [pMem] "+Q" (*pu64)
2561 , [uOldActual] "=&r" (*pu64Old)
2562 , [fXchg] "=&r" (fXchg.u)
2563 : [uNew] "r" (u64New)
2564 , [uOldOrg] "r" (u64Old)
2565 , "[uOldActual]" (u64Old)
2566 : "cc");
2567# else
2568 uint64_t u64ActualOld;
2569 uint32_t rcSpill;
2570 __asm__ __volatile__("Ltry_again_ASMAtomicCmpXchgU64_%=:\n\t"
2571 RTASM_ARM_DMB_SY
2572# if defined(RT_ARCH_ARM64)
2573 "ldaxr %[uOld], %[pMem]\n\t"
2574 "cmp %[uOld], %[uCmp]\n\t"
2575 "bne 1f\n\t" /* stop here if not equal */
2576 "stlxr %w[rc], %[uNew], %[pMem]\n\t"
2577 "cbnz %w[rc], Ltry_again_ASMAtomicCmpXchgU64_%=\n\t"
2578 "mov %w[fXchg], #1\n\t"
2579 "1:\n\t"
2580 "clrex\n\t"
2581# else
2582 "ldrexd %[uOld], %H[uOld], %[pMem]\n\t"
2583 "teq %[uOld], %[uCmp]\n\t"
2584 "teqeq %H[uOld], %H[uCmp]\n\t"
2585 "strexdeq %[rc], %[uNew], %H[uNew], %[pMem]\n\t"
2586 "bne 1f\n\t" /* stop here if not equal */
2587 "cmp %[rc], #0\n\t"
2588 "bne Ltry_again_ASMAtomicCmpXchgU64_%=\n\t"
2589 "mov %[fXchg], #1\n\t"
2590 "1:\n\t"
2591 /** @todo clrexne on armv7? */
2592# endif
2593 : [pMem] "+Q" (*pu64)
2594 , [uOld] "=&r" (u64ActualOld)
2595 , [rc] "=&r" (rcSpill)
2596 , [fXchg] "=&r" (fXchg.u)
2597 : [uCmp] "r" (u64Old)
2598 , [uNew] "r" (u64New)
2599 , "[fXchg]" (0)
2600 RTASM_ARM_DMB_SY_COMMA_IN_REG
2601 : "cc");
2602 *pu64Old = u64ActualOld;
2603# endif
2604 return fXchg.f;
2605
2606# else
2607# error "Port me"
2608# endif
2609}
2610#endif
2611
2612
2613/**
2614 * Atomically Compare and exchange a signed 64-bit value, additionally
2615 * passing back old value, ordered.
2616 *
2617 * @returns true if xchg was done.
2618 * @returns false if xchg wasn't done.
2619 *
2620 * @param pi64 Pointer to the 64-bit variable to update.
2621 * @param i64 The 64-bit value to assign to *pu64.
2622 * @param i64Old The value to compare with.
2623 * @param pi64Old Pointer store the old value at.
2624 *
2625 * @remarks x86: Requires a Pentium or later.
2626 */
2627DECLINLINE(bool) ASMAtomicCmpXchgExS64(volatile int64_t RT_FAR *pi64, const int64_t i64, const int64_t i64Old, int64_t RT_FAR *pi64Old) RT_NOTHROW_DEF
2628{
2629 return ASMAtomicCmpXchgExU64((volatile uint64_t RT_FAR *)pi64, (uint64_t)i64, (uint64_t)i64Old, (uint64_t RT_FAR *)pi64Old);
2630}
2631
2632#if defined(RT_ARCH_AMD64) || defined(RT_ARCH_ARM64) || defined(DOXYGEN_RUNNING)
2633
/** @def RTASM_HAVE_CMP_XCHG_U128
 * Indicates that we've got ASMAtomicCmpXchgU128v2(), ASMAtomicCmpXchgU128()
 * and ASMAtomicCmpXchgU128U() available (AMD64 and ARM64 only). */
# define RTASM_HAVE_CMP_XCHG_U128 1
2638
2639
2640/**
2641 * Atomically compare and exchange an unsigned 128-bit value, ordered.
2642 *
2643 * @returns true if exchange was done.
2644 * @returns false if exchange wasn't done.
2645 *
2646 * @param pu128 Pointer to the 128-bit variable to update.
2647 * @param u64NewHi The high 64 bits of the value to assign to *pu128.
2648 * @param u64NewLo The low 64 bits of the value to assign to *pu128.
2649 * @param u64OldHi The high 64-bit of the value to compare with.
2650 * @param u64OldLo The low 64-bit of the value to compare with.
2651 * @param pu128Old Where to return the old value.
2652 *
2653 * @remarks AMD64: Not present in the earliest CPUs, so check CPUID.
2654 */
2655# if (RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN)
2656DECLASM(bool) ASMAtomicCmpXchgU128v2(volatile uint128_t *pu128, const uint64_t u64NewHi, const uint64_t u64NewLo,
2657 const uint64_t u64OldHi, const uint64_t u64OldLo, uint128_t *pu128Old) RT_NOTHROW_PROTO;
2658# else
2659DECLINLINE(bool) ASMAtomicCmpXchgU128v2(volatile uint128_t *pu128, const uint64_t u64NewHi, const uint64_t u64NewLo,
2660 const uint64_t u64OldHi, const uint64_t u64OldLo, uint128_t *pu128Old) RT_NOTHROW_DEF
2661{
2662# if RT_INLINE_ASM_USES_INTRIN
2663 pu128Old->Hi = u64OldHi;
2664 pu128Old->Lo = u64OldLo;
2665 AssertCompileMemberOffset(uint128_t, Lo, 0);
2666 return _InterlockedCompareExchange128((__int64 volatile *)pu128, u64NewHi, u64NewLo, (__int64 *)&pu128Old->Lo) != 0;
2667
2668# elif (defined(__clang_major__) || defined(__GNUC__)) && defined(RT_ARCH_ARM64)
2669 uint128_t const uCmp = ((uint128_t)u64OldHi << 64) | u64OldLo;
2670 uint128_t const uOld = __sync_val_compare_and_swap(pu128, uCmp, ((uint128_t)u64NewHi << 64) | u64NewLo);
2671 *pu128Old = uOld;
2672 return uCmp == uOld;
2673
2674# elif defined(RT_ARCH_AMD64)
2675# if RT_INLINE_ASM_GNU_STYLE
2676 uint8_t bRet;
2677 uint64_t u64RetHi, u64RetLo;
2678 __asm__ __volatile__("lock; cmpxchg16b %3\n\t"
2679 "setz %b0\n\t"
2680 : "=r" (bRet)
2681 , "=a" (u64RetLo)
2682 , "=d" (u64RetHi)
2683 , "+m" (*pu128)
2684 : "a" (u64OldLo)
2685 , "d" (u64OldHi)
2686 , "b" (u64NewLo)
2687 , "c" (u64NewHi)
2688 : "cc");
2689 *pu128Old = ((uint128_t)u64RetHi << 64) | u64RetLo;
2690 return (bool)bRet;
2691# else
2692# error "Port me"
2693# endif
2694# else
2695# error "Port me"
2696# endif
2697}
2698# endif
2699
2700
2701/**
2702 * Atomically compare and exchange an unsigned 128-bit value, ordered.
2703 *
2704 * @returns true if exchange was done.
2705 * @returns false if exchange wasn't done.
2706 *
2707 * @param pu128 Pointer to the 128-bit variable to update.
2708 * @param u128New The 128-bit value to assign to *pu128.
2709 * @param u128Old The value to compare with.
2710 * @param pu128Old Where to return the old value.
2711 *
2712 * @remarks AMD64: Not present in the earliest CPUs, so check CPUID.
2713 */
2714DECLINLINE(bool) ASMAtomicCmpXchgU128(volatile uint128_t *pu128, const uint128_t u128New,
2715 const uint128_t u128Old, uint128_t *pu128Old) RT_NOTHROW_DEF
2716{
2717# ifdef RT_COMPILER_WITH_128BIT_INT_TYPES
2718# if (defined(__clang_major__) || defined(__GNUC__)) && defined(RT_ARCH_ARM64)
2719 uint128_t const uSwapped = __sync_val_compare_and_swap(pu128, u128Old, u128New);
2720 *pu128Old = uSwapped;
2721 return uSwapped == u128Old;
2722# else
2723 return ASMAtomicCmpXchgU128v2(pu128, (uint64_t)(u128New >> 64), (uint64_t)u128New,
2724 (uint64_t)(u128Old >> 64), (uint64_t)u128Old, pu128Old);
2725# endif
2726# else
2727 return ASMAtomicCmpXchgU128v2(pu128, u128New.Hi, u128New.Lo, u128Old.Hi, u128Old.Lo, pu128Old);
2728# endif
2729}
2730
2731
2732/**
2733 * RTUINT128U wrapper for ASMAtomicCmpXchgU128.
2734 */
2735DECLINLINE(bool) ASMAtomicCmpXchgU128U(volatile RTUINT128U *pu128, const RTUINT128U u128New,
2736 const RTUINT128U u128Old, PRTUINT128U pu128Old) RT_NOTHROW_DEF
2737{
2738# if (defined(__clang_major__) || defined(__GNUC__)) && defined(RT_ARCH_ARM64)
2739 return ASMAtomicCmpXchgU128(&pu128->u, u128New.u, u128Old.u, &pu128Old->u);
2740# else
2741 return ASMAtomicCmpXchgU128v2(&pu128->u, u128New.s.Hi, u128New.s.Lo, u128Old.s.Hi, u128Old.s.Lo, &pu128Old->u);
2742# endif
2743}
2744
2745#endif /* RT_ARCH_AMD64 || RT_ARCH_ARM64 */
2746
2747
2748
2749/** @def ASMAtomicCmpXchgExHandle
2750 * Atomically Compare and Exchange a typical IPRT handle value, ordered.
2751 *
2752 * @param ph Pointer to the value to update.
2753 * @param hNew The new value to assigned to *pu.
2754 * @param hOld The old value to *pu compare with.
2755 * @param fRc Where to store the result.
2756 * @param phOldVal Pointer to where to store the old value.
2757 *
2758 * @remarks This doesn't currently work for all handles (like RTFILE).
2759 */
2760#if HC_ARCH_BITS == 32 || ARCH_BITS == 16
2761# define ASMAtomicCmpXchgExHandle(ph, hNew, hOld, fRc, phOldVal) \
2762 do { \
2763 AssertCompile(sizeof(*ph) == sizeof(uint32_t)); \
2764 AssertCompile(sizeof(*phOldVal) == sizeof(uint32_t)); \
2765 (fRc) = ASMAtomicCmpXchgExU32((volatile uint32_t RT_FAR *)(ph), (uint32_t)(hNew), (uint32_t)(hOld), (uint32_t RT_FAR *)(phOldVal)); \
2766 } while (0)
2767#elif HC_ARCH_BITS == 64
2768# define ASMAtomicCmpXchgExHandle(ph, hNew, hOld, fRc, phOldVal) \
2769 do { \
2770 AssertCompile(sizeof(*(ph)) == sizeof(uint64_t)); \
2771 AssertCompile(sizeof(*(phOldVal)) == sizeof(uint64_t)); \
2772 (fRc) = ASMAtomicCmpXchgExU64((volatile uint64_t RT_FAR *)(ph), (uint64_t)(hNew), (uint64_t)(hOld), (uint64_t RT_FAR *)(phOldVal)); \
2773 } while (0)
2774#else
2775# error HC_ARCH_BITS
2776#endif
2777
2778
/** @def ASMAtomicCmpXchgExSize
 * Atomically Compare and Exchange a value which size might differ
 * between platforms or compilers. Additionally passes back old value.
 *
 * Dispatches on sizeof(*pu): 4 byte values go to ASMAtomicCmpXchgExU32 and
 * 8 byte values to ASMAtomicCmpXchgExU64.  Any other size triggers an
 * assertion, sets @a fRc to false and zeros *puOldVal.
 *
 * @param   pu          Pointer to the value to update.
 * @param   uNew        The new value to assign to *pu.
 * @param   uOld        The value to compare *pu with.
 * @param   fRc         Where to store the result.
 * @param   puOldVal    Pointer to where to store the old value.
 *
 * @remarks x86: Requires a 486 or later.
 */
#define ASMAtomicCmpXchgExSize(pu, uNew, uOld, fRc, puOldVal) \
    do { \
        switch (sizeof(*(pu))) { \
            case 4: (fRc) = ASMAtomicCmpXchgExU32((volatile uint32_t RT_FAR *)(void RT_FAR *)(pu), (uint32_t)(uNew), (uint32_t)(uOld), (uint32_t RT_FAR *)(puOldVal)); \
                break; \
            case 8: (fRc) = ASMAtomicCmpXchgExU64((volatile uint64_t RT_FAR *)(void RT_FAR *)(pu), (uint64_t)(uNew), (uint64_t)(uOld), (uint64_t RT_FAR *)(puOldVal)); \
                break; \
            /* sizeof yields size_t; cast it so the argument matches the %d conversion. */ \
            default: AssertMsgFailed(("ASMAtomicCmpXchgExSize: size %d is not supported\n", (int)sizeof(*(pu)))); \
                (fRc) = false; \
                *(puOldVal) = 0; \
                break; \
        } \
    } while (0)
2804
2805
2806/**
2807 * Atomically Compare and Exchange a pointer value, additionally
2808 * passing back old value, ordered.
2809 *
2810 * @returns true if xchg was done.
2811 * @returns false if xchg wasn't done.
2812 *
2813 * @param ppv Pointer to the value to update.
2814 * @param pvNew The new value to assigned to *ppv.
2815 * @param pvOld The old value to *ppv compare with.
2816 * @param ppvOld Pointer store the old value at.
2817 *
2818 * @remarks x86: Requires a 486 or later.
2819 */
2820DECLINLINE(bool) ASMAtomicCmpXchgExPtrVoid(void RT_FAR * volatile RT_FAR *ppv, const void RT_FAR *pvNew, const void RT_FAR *pvOld,
2821 void RT_FAR * RT_FAR *ppvOld) RT_NOTHROW_DEF
2822{
2823#if ARCH_BITS == 32 || ARCH_BITS == 16
2824 return ASMAtomicCmpXchgExU32((volatile uint32_t RT_FAR *)(void RT_FAR *)ppv, (uint32_t)pvNew, (uint32_t)pvOld, (uint32_t RT_FAR *)ppvOld);
2825#elif ARCH_BITS == 64
2826 return ASMAtomicCmpXchgExU64((volatile uint64_t RT_FAR *)(void RT_FAR *)ppv, (uint64_t)pvNew, (uint64_t)pvOld, (uint64_t RT_FAR *)ppvOld);
2827#else
2828# error "ARCH_BITS is bogus"
2829#endif
2830}
2831
2832
/**
 * Atomically Compare and Exchange a pointer value, additionally
 * passing back old value, ordered.
 *
 * Typed front end for ASMAtomicCmpXchgExPtrVoid.
 *
 * @returns true if xchg was done.
 * @returns false if xchg wasn't done.
 *
 * @param   ppv         Pointer to the value to update.
 * @param   pvNew       The new value to assign to *ppv.
 * @param   pvOld       The old value to compare *ppv with.
 * @param   ppvOld      Pointer to store the old value at.
 *
 * @remarks This is relatively type safe on GCC platforms.
 * @remarks x86: Requires a 486 or later.
 */
#ifdef __GNUC__
/* GNU flavour: the typed temporaries force the compiler to check every
   argument against the type of *ppv before the casts to void pointers. */
# define ASMAtomicCmpXchgExPtr(ppv, pvNew, pvOld, ppvOld) \
    __extension__ \
    ({ \
        __typeof__(*(ppv)) volatile * const ppvDstChecked    = (ppv); \
        __typeof__(*(ppv)) const            pvNewChecked     = (pvNew); \
        __typeof__(*(ppv)) const            pvCmpChecked     = (pvOld); \
        __typeof__(*(ppv)) * const          ppvOldDstChecked = (ppvOld); \
        bool fXchgDone = ASMAtomicCmpXchgExPtrVoid((void * volatile *)ppvDstChecked, \
                                                   (void *)pvNewChecked, \
                                                   (void *)pvCmpChecked, \
                                                   (void **)ppvOldDstChecked); \
        fXchgDone; \
    })
#else
/* Other compilers: no type checking, just cast the arguments and forward them. */
# define ASMAtomicCmpXchgExPtr(ppv, pvNew, pvOld, ppvOld) \
    ASMAtomicCmpXchgExPtrVoid((void RT_FAR * volatile RT_FAR *)(ppv), \
                              (void RT_FAR *)(pvNew), \
                              (void RT_FAR *)(pvOld), \
                              (void RT_FAR * RT_FAR *)(ppvOld))
#endif
2865
2866
/**
 * Virtualization unfriendly serializing instruction, always exits.
 *
 * Executes CPUID with EAX=0 and throws the result away.  The inline version is
 * only provided for x86/AMD64; in all other configurations (and when inline
 * assembly is external and intrinsics are not used) only a prototype is
 * declared here.
 */
#if (RT_INLINE_ASM_EXTERNAL && !RT_INLINE_ASM_USES_INTRIN) || (!defined(RT_ARCH_AMD64) && !defined(RT_ARCH_X86))
RT_ASM_DECL_PRAGMA_WATCOM(void) ASMSerializeInstructionCpuId(void) RT_NOTHROW_PROTO;
#else
DECLINLINE(void) ASMSerializeInstructionCpuId(void) RT_NOTHROW_DEF
{
# if RT_INLINE_ASM_GNU_STYLE
    RTCCUINTREG xAX = 0; /* The CPUID leaf, passed in and returned via xAX. */
#  ifdef RT_ARCH_AMD64
    __asm__ __volatile__ ("cpuid"
                          : "=a" (xAX)
                          : "0" (xAX)
                          : "rbx", "rcx", "rdx", "memory");
#  elif (defined(PIC) || defined(__PIC__)) && defined(__i386__)
    /* 32-bit PIC build: EBX is saved/restored by hand instead of being listed
       as clobbered (presumably because it is reserved in PIC code - confirm). */
    __asm__ __volatile__ ("push %%ebx\n\t"
                          "cpuid\n\t"
                          "pop %%ebx\n\t"
                          : "=a" (xAX)
                          : "0" (xAX)
                          : "ecx", "edx", "memory");
#  else
    __asm__ __volatile__ ("cpuid"
                          : "=a" (xAX)
                          : "0" (xAX)
                          : "ebx", "ecx", "edx", "memory");
#  endif

# elif RT_INLINE_ASM_USES_INTRIN
    /* Intrinsic variant: compiler barrier followed by CPUID leaf 0; aInfo is ignored. */
    int aInfo[4];
    _ReadWriteBarrier();
    __cpuid(aInfo, 0);

# else
    /* Plain MSC-style inline assembly, preserving EBX around CPUID. */
    __asm
    {
        push    ebx
        xor     eax, eax
        cpuid
        pop     ebx
    }
# endif
}
#endif
2912
/**
 * Virtualization friendly serializing instruction, though more expensive.
 *
 * Builds an interrupt return frame on the stack that targets the instruction
 * right after the IRET and then executes IRET.  The inline version is only
 * provided for x86/AMD64 when inline assembly is not external; otherwise only
 * a prototype is declared here.
 */
#if RT_INLINE_ASM_EXTERNAL || (!defined(RT_ARCH_AMD64) && !defined(RT_ARCH_X86))
RT_ASM_DECL_PRAGMA_WATCOM(void) ASMSerializeInstructionIRet(void) RT_NOTHROW_PROTO;
#else
DECLINLINE(void) ASMSerializeInstructionIRet(void) RT_NOTHROW_DEF
{
# if RT_INLINE_ASM_GNU_STYLE
#  ifdef RT_ARCH_AMD64
    /* Frame pushed (in order): SS, original RSP (kept in r10), RFLAGS, CS and
       the address of local label 1.  RSP is first lowered by 128 bytes to step
       over the red zone. */
    __asm__ __volatile__ ("movq %%rsp,%%r10\n\t"
                          "subq $128, %%rsp\n\t" /*redzone*/
                          "mov %%ss, %%eax\n\t"
                          "pushq %%rax\n\t"
                          "pushq %%r10\n\t"
                          "pushfq\n\t"
                          "movl %%cs, %%eax\n\t"
                          "pushq %%rax\n\t"
                          "leaq 1f(%%rip), %%rax\n\t"
                          "pushq %%rax\n\t"
                          "iretq\n\t"
                          "1:\n\t"
                          ::: "rax", "r10", "memory", "cc");
#  else
    /* 32-bit frame: EFLAGS, CS and the address of local label 1. */
    __asm__ __volatile__ ("pushfl\n\t"
                          "pushl %%cs\n\t"
                          "pushl $1f\n\t"
                          "iretl\n\t"
                          "1:\n\t"
                          ::: "memory");
#  endif

# else
    /* MSC-style inline assembly doing the same as the 32-bit GNU variant. */
    __asm
    {
        pushfd
        push    cs
        push    la_ret
        iretd
    la_ret:
    }
# endif
}
#endif
2957
/**
 * Virtualization friendlier serializing instruction, may still cause exits.
 *
 * Executes RDTSCP and discards everything it returns.  The inline version is
 * only provided for x86/AMD64; with external inline assembly it additionally
 * requires the intrinsics of Visual C++ 2008 or later.
 */
#if (RT_INLINE_ASM_EXTERNAL && RT_INLINE_ASM_USES_INTRIN < RT_MSC_VER_VS2008) || (!defined(RT_ARCH_AMD64) && !defined(RT_ARCH_X86))
RT_ASM_DECL_PRAGMA_WATCOM(void) ASMSerializeInstructionRdTscp(void) RT_NOTHROW_PROTO;
#else
DECLINLINE(void) ASMSerializeInstructionRdTscp(void) RT_NOTHROW_DEF
{
# if RT_INLINE_ASM_GNU_STYLE
    /* rdtscp is not supported by ancient linux build VM of course :-(
       The instruction is therefore emitted as raw bytes below. */
#  ifdef RT_ARCH_AMD64
    /*__asm__ __volatile__("rdtscp\n\t" ::: "rax", "rdx", "rcx"); */
    __asm__ __volatile__(".byte 0x0f,0x01,0xf9\n\t" ::: "rax", "rdx", "rcx", "memory");
#  else
    /*__asm__ __volatile__("rdtscp\n\t" ::: "eax", "edx", "ecx"); */
    __asm__ __volatile__(".byte 0x0f,0x01,0xf9\n\t" ::: "eax", "edx", "ecx", "memory");
#  endif
# else
#  if RT_INLINE_ASM_USES_INTRIN >= RT_MSC_VER_VS2008
    /* Intrinsic variant: compiler barrier, then rdtscp with both outputs ignored. */
    uint32_t uIgnore;
    _ReadWriteBarrier();
    (void)__rdtscp(&uIgnore);
    (void)uIgnore;
#  else
    __asm
    {
        rdtscp
    }
#  endif
# endif
}
#endif
2990
2991
/**
 * Serialize Instruction (both data store and instruction flush).
 *
 * Selection by target:
 *      - 16-bit x86 and guest code (IN_GUEST): ASMSerializeInstructionIRet.
 *      - Other x86/AMD64: ASMSerializeInstructionCpuId.
 *      - SPARC64: external function, only the prototype is given here.
 *      - ARM64/ARM32: inline function issuing a DSB SY barrier.
 */
#if (defined(RT_ARCH_X86) && ARCH_BITS == 16) || defined(IN_GUEST)
# define ASMSerializeInstruction() ASMSerializeInstructionIRet()
#elif defined(RT_ARCH_X86) || defined(RT_ARCH_AMD64)
# define ASMSerializeInstruction() ASMSerializeInstructionCpuId()
#elif defined(RT_ARCH_SPARC64)
RTDECL(void) ASMSerializeInstruction(void) RT_NOTHROW_PROTO;
#elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
DECLINLINE(void) ASMSerializeInstruction(void) RT_NOTHROW_DEF
{
# if RT_INLINE_ASM_USES_INTRIN
    __dsb(_ARM64_BARRIER_SY);
# else
    /* The barrier text and any input register it needs come from the RTASM_ARM_DSB_SY* macros. */
    __asm__ __volatile__ (RTASM_ARM_DSB_SY :: RTASM_ARM_DSB_SY_IN_REG :);
# endif
}
#else
# error "Port me"
#endif
3013
3014
/**
 * Memory fence, waits for any pending writes and reads to complete.
 *
 * On AMD64 and on x86 without RT_WITH_OLD_CPU_SUPPORT a fence instruction is
 * used (_mm_mfence in the intrinsic variant), on ARM a DMB SY barrier, and on
 * everything else an atomic exchange on a dummy stack variable stands in for
 * the fence.
 *
 * @note    No implicit compiler barrier (which is probably stupid).
 */
DECLINLINE(void) ASMMemoryFence(void) RT_NOTHROW_DEF
{
#if defined(RT_ARCH_AMD64) || (defined(RT_ARCH_X86) && !defined(RT_WITH_OLD_CPU_SUPPORT))
# if RT_INLINE_ASM_GNU_STYLE
    /* Hand-encoded instruction bytes; the intrinsic branch uses _mm_mfence() instead. */
    __asm__ __volatile__ (".byte 0x0f,0xae,0xf0\n\t");
# elif RT_INLINE_ASM_USES_INTRIN
    _mm_mfence();
# else
    __asm
    {
        _emit   0x0f
        _emit   0xae
        _emit   0xf0
    }
# endif
#elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
# if RT_INLINE_ASM_USES_INTRIN
    __dmb(_ARM64_BARRIER_SY);
# else
    __asm__ __volatile__ (RTASM_ARM_DMB_SY :: RTASM_ARM_DMB_SY_IN_REG :);
# endif
#elif ARCH_BITS == 16
    /* Fallback: atomic exchange on a dummy variable (its initial value is irrelevant). */
    uint16_t volatile u16;
    ASMAtomicXchgU16(&u16, 0);
#else
    /* Fallback: atomic exchange on a dummy variable (its initial value is irrelevant). */
    uint32_t volatile u32;
    ASMAtomicXchgU32(&u32, 0);
#endif
}
3048
3049
/**
 * Write fence, waits for any pending writes to complete.
 *
 * On AMD64 and on x86 without RT_WITH_OLD_CPU_SUPPORT a store fence is used
 * (_mm_sfence in the intrinsic variant), on ARM a DMB ST barrier; everything
 * else falls back on the full ASMMemoryFence.
 *
 * @note    No implicit compiler barrier (which is probably stupid).
 */
DECLINLINE(void) ASMWriteFence(void) RT_NOTHROW_DEF
{
#if defined(RT_ARCH_AMD64) || (defined(RT_ARCH_X86) && !defined(RT_WITH_OLD_CPU_SUPPORT))
# if RT_INLINE_ASM_GNU_STYLE
    /* Hand-encoded instruction bytes; the intrinsic branch uses _mm_sfence() instead. */
    __asm__ __volatile__ (".byte 0x0f,0xae,0xf8\n\t");
# elif RT_INLINE_ASM_USES_INTRIN
    _mm_sfence();
# else
    __asm
    {
        _emit   0x0f
        _emit   0xae
        _emit   0xf8
    }
# endif
#elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
# if RT_INLINE_ASM_USES_INTRIN
    __dmb(_ARM64_BARRIER_ST);
# else
    __asm__ __volatile__ (RTASM_ARM_DMB_ST :: RTASM_ARM_DMB_ST_IN_REG :);
# endif
#else
    /* No dedicated write fence available here, use the full fence. */
    ASMMemoryFence();
#endif
}
3079
3080
/**
 * Read fence, waits for any pending reads to complete.
 *
 * On AMD64 and on x86 without RT_WITH_OLD_CPU_SUPPORT a load fence is used
 * (_mm_lfence in the intrinsic variant), on ARM a DMB LD barrier; everything
 * else falls back on the full ASMMemoryFence.
 *
 * @note    No implicit compiler barrier (which is probably stupid).
 */
DECLINLINE(void) ASMReadFence(void) RT_NOTHROW_DEF
{
#if defined(RT_ARCH_AMD64) || (defined(RT_ARCH_X86) && !defined(RT_WITH_OLD_CPU_SUPPORT))
# if RT_INLINE_ASM_GNU_STYLE
    /* Hand-encoded instruction bytes; the intrinsic branch uses _mm_lfence() instead. */
    __asm__ __volatile__ (".byte 0x0f,0xae,0xe8\n\t");
# elif RT_INLINE_ASM_USES_INTRIN
    _mm_lfence();
# else
    __asm
    {
        _emit   0x0f
        _emit   0xae
        _emit   0xe8
    }
# endif
#elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
# if RT_INLINE_ASM_USES_INTRIN
    __dmb(_ARM64_BARRIER_LD);
# else
    __asm__ __volatile__ (RTASM_ARM_DMB_LD :: RTASM_ARM_DMB_LD_IN_REG :);
# endif
#else
    /* No dedicated read fence available here, use the full fence. */
    ASMMemoryFence();
#endif
}
3110
3111
/**
 * Atomically reads an unsigned 8-bit value, ordered.
 *
 * On ARM this is either the __load_acquire8 intrinsic or a barrier
 * (RTASM_ARM_DMB_SY) followed by a byte load; on other targets it is
 * ASMMemoryFence followed by a plain volatile read.
 *
 * @returns Current *pu8 value
 * @param   pu8    Pointer to the 8-bit variable to read.
 */
DECLINLINE(uint8_t) ASMAtomicReadU8(volatile uint8_t RT_FAR *pu8) RT_NOTHROW_DEF
{
#if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
# if RT_INLINE_ASM_USES_INTRIN
    return __load_acquire8(pu8);

# else
    /** @todo check out using ldarb (like __load_acquire8). */
    uint32_t u32; /* The byte is loaded into a 32-bit register and truncated on return. */
#  if defined(RTASM_ARM64_USE_FEAT_LSE) && 0 /* very expensive on M1 */
    __asm__ __volatile__("Lstart_ASMAtomicReadU8_%=:\n\t"
                         RTASM_ARM_DMB_SY
                         "casab %w[uDst], wzr, %[pMem]\n\t"
                         : [uDst] "=&r" (u32)
                         : [pMem] "Q" (*pu8),
                           "0" (0)
                           RTASM_ARM_DMB_SY_COMMA_IN_REG);
#  else
    __asm__ __volatile__("Lstart_ASMAtomicReadU8_%=:\n\t"
                         RTASM_ARM_DMB_SY
#   if defined(RT_ARCH_ARM64)
#    if 1 /* shouldn't be any need for more than single-copy atomicity when we've got a proper barrier, just like on x86. */
                         "ldurb %w[uDst], %[pMem]\n\t"
#    else
                         "ldxrb %w[uDst], %[pMem]\n\t"
                         "clrex\n\t"
#    endif
#   else
                         "ldrexb %[uDst], %[pMem]\n\t"
                         /** @todo clrex */
#   endif
                         : [uDst] "=&r" (u32)
                         : [pMem] "Q" (*pu8)
                           RTASM_ARM_DMB_SY_COMMA_IN_REG);
#  endif
    return (uint8_t)u32;
# endif

#else
    ASMMemoryFence();
    return *pu8; /* byte reads are atomic on x86 */
#endif
}
3161
3162
/**
 * Atomically reads an unsigned 8-bit value, unordered.
 *
 * Same as ASMAtomicReadU8 but without any fence/barrier in front of the load.
 *
 * @returns Current *pu8 value
 * @param   pu8    Pointer to the 8-bit variable to read.
 */
DECLINLINE(uint8_t) ASMAtomicUoReadU8(volatile uint8_t RT_FAR *pu8) RT_NOTHROW_DEF
{
#if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
# if RT_INLINE_ASM_USES_INTRIN
    return (uint8_t)__iso_volatile_load8((volatile char *)pu8); /* (emits ldrsb, sign-extending it to 32-bit) */

# else
    uint32_t u32; /* The byte is loaded into a 32-bit register and truncated on return. */
    __asm__ __volatile__("Lstart_ASMAtomicUoReadU8_%=:\n\t"
#  if defined(RT_ARCH_ARM64)
                         "ldurb %w[uDst], %[pMem]\n\t"
#  else
                         "ldrexb %[uDst], %[pMem]\n\t" /** @todo fix this */
#  endif
                         : [uDst] "=&r" (u32)
                         : [pMem] "Q" (*pu8));
    return (uint8_t)u32;
# endif

#else
    return *pu8; /* byte reads are atomic on x86 */
#endif
}
3192
3193
3194/**
3195 * Atomically reads a signed 8-bit value, ordered.
3196 *
3197 * @returns Current *pi8 value
3198 * @param pi8 Pointer to the 8-bit variable to read.
3199 */
3200DECLINLINE(int8_t) ASMAtomicReadS8(volatile int8_t RT_FAR *pi8) RT_NOTHROW_DEF
3201{
3202#if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
3203 return (int8_t)ASMAtomicReadU8((volatile uint8_t RT_FAR *)pi8);
3204#else
3205 ASMMemoryFence();
3206 return *pi8; /* byte reads are atomic on x86 */
3207#endif
3208}
3209
3210
/**
 * Atomically reads a signed 8-bit value, unordered.
 *
 * No fence/barrier is issued in front of the load.
 *
 * @returns Current *pi8 value
 * @param   pi8    Pointer to the 8-bit variable to read.
 */
DECLINLINE(int8_t) ASMAtomicUoReadS8(volatile int8_t RT_FAR *pi8) RT_NOTHROW_DEF
{
#if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
# if RT_INLINE_ASM_USES_INTRIN
    return __iso_volatile_load8((volatile const char *)pi8);

# else
    int32_t i32; /* The byte is loaded into a 32-bit register; the cast on return narrows it to int8_t. */
    __asm__ __volatile__("Lstart_ASMAtomicUoReadS8_%=:\n\t"
#  if defined(RT_ARCH_ARM64)
                         "ldurb %w[iDst], %[pMem]\n\t"
#  else
                         "ldrexb %[iDst], %[pMem]\n\t" /** @todo fix this */
#  endif
                         : [iDst] "=&r" (i32)
                         : [pMem] "Q" (*pi8));
    return (int8_t)i32;
# endif

#else
    return *pi8; /* byte reads are atomic on x86 */
#endif
}
3240
3241
/**
 * Atomically reads an unsigned 16-bit value, ordered.
 *
 * The variable is asserted to be 2-byte aligned.  On ARM this is either the
 * __load_acquire16 intrinsic or a barrier (RTASM_ARM_DMB_SY) followed by a
 * halfword load; on other targets it is ASMMemoryFence followed by a plain
 * volatile read.
 *
 * @returns Current *pu16 value
 * @param   pu16    Pointer to the 16-bit variable to read.
 */
DECLINLINE(uint16_t) ASMAtomicReadU16(volatile uint16_t RT_FAR *pu16) RT_NOTHROW_DEF
{
    Assert(!((uintptr_t)pu16 & 1));
#if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
# if RT_INLINE_ASM_USES_INTRIN
    return __load_acquire16(pu16);

# else
    /** @todo check out using ldarh (like __load_acquire16). */
    uint32_t u32; /* The halfword is loaded into a 32-bit register and truncated on return. */
#  if defined(RTASM_ARM64_USE_FEAT_LSE) && 0 /* very expensive on M1, but alignment advantages with LEA2 (M2?). */
    __asm__ __volatile__("Lstart_ASMAtomicReadU16_%=:\n\t"
                         RTASM_ARM_DMB_SY
                         "casah %w[uDst], wzr, %[pMem]\n\t"
                         : [uDst] "=&r" (u32)
                         : [pMem] "Q" (*pu16),
                           "0" (0)
                           RTASM_ARM_DMB_SY_COMMA_IN_REG);
#  else
    __asm__ __volatile__("Lstart_ASMAtomicReadU16_%=:\n\t"
                         RTASM_ARM_DMB_SY
#   if defined(RT_ARCH_ARM64)
#    if 1 /* ASSUMING proper barrier and aligned access, we should be fine with single-copy atomicity, just like on x86. */
                         "ldurh %w[uDst], %[pMem]\n\t"
#    else
                         "ldxrh %w[uDst], %[pMem]\n\t"
                         "clrex\n\t"
#    endif
#   else
                         "ldrexh %[uDst], %[pMem]\n\t"
                         /** @todo clrex */
#   endif
                         : [uDst] "=&r" (u32)
                         : [pMem] "Q" (*pu16)
                           RTASM_ARM_DMB_SY_COMMA_IN_REG);
#  endif
    return (uint16_t)u32;
# endif

#else
    ASMMemoryFence();
    return *pu16;
#endif
}
3292
3293
/**
 * Atomically reads an unsigned 16-bit value, unordered.
 *
 * The variable is asserted to be 2-byte aligned.  No fence/barrier is issued
 * in front of the load.
 *
 * @returns Current *pu16 value
 * @param   pu16    Pointer to the 16-bit variable to read.
 */
DECLINLINE(uint16_t) ASMAtomicUoReadU16(volatile uint16_t RT_FAR *pu16) RT_NOTHROW_DEF
{
    Assert(!((uintptr_t)pu16 & 1));
#if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
# if RT_INLINE_ASM_USES_INTRIN
    return (uint16_t)__iso_volatile_load16((volatile int16_t *)pu16); /* (emits ldrsh, sign-extending it to 32-bit) */

# else
    uint32_t u32; /* The halfword is loaded into a 32-bit register and truncated on return. */
    __asm__ __volatile__("Lstart_ASMAtomicUoReadU16_%=:\n\t"
#  if defined(RT_ARCH_ARM64)
                         "ldurh %w[uDst], %[pMem]\n\t"
#  else
                         "ldrexh %[uDst], %[pMem]\n\t" /** @todo fix this */
#  endif
                         : [uDst] "=&r" (u32)
                         : [pMem] "Q" (*pu16));
    return (uint16_t)u32;
# endif

#else
    return *pu16;
#endif
}
3324
3325
3326/**
3327 * Atomically reads a signed 16-bit value, ordered.
3328 *
3329 * @returns Current *pi16 value
3330 * @param pi16 Pointer to the 16-bit variable to read.
3331 */
3332DECLINLINE(int16_t) ASMAtomicReadS16(volatile int16_t RT_FAR *pi16) RT_NOTHROW_DEF
3333{
3334 Assert(!((uintptr_t)pi16 & 1));
3335#if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
3336 return (int16_t)ASMAtomicReadU16((volatile uint16_t RT_FAR *)pi16);
3337#else
3338 ASMMemoryFence();
3339 return *pi16;
3340#endif
3341}
3342
3343
/**
 * Atomically reads a signed 16-bit value, unordered.
 *
 * The variable is asserted to be 2-byte aligned.  No fence/barrier is issued
 * in front of the load.
 *
 * @returns Current *pi16 value
 * @param   pi16    Pointer to the 16-bit variable to read.
 */
DECLINLINE(int16_t) ASMAtomicUoReadS16(volatile int16_t RT_FAR *pi16) RT_NOTHROW_DEF
{
    Assert(!((uintptr_t)pi16 & 1));
#if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
# if RT_INLINE_ASM_USES_INTRIN
    return __iso_volatile_load16(pi16);

# else
    int32_t i32; /* The halfword is loaded into a 32-bit register; the cast on return narrows it to int16_t. */
    __asm__ __volatile__("Lstart_ASMAtomicUoReadS16_%=:\n\t"
#  if defined(RT_ARCH_ARM64)
                         "ldurh %w[iDst], %[pMem]\n\t"
#  else
                         "ldrexh %[iDst], %[pMem]\n\t" /** @todo fix this */
#  endif
                         : [iDst] "=&r" (i32)
                         : [pMem] "Q" (*pi16));
    return (int16_t)i32;
# endif

#else
    return *pi16;
#endif
}
3374
3375
/**
 * Atomically reads an unsigned 32-bit value, ordered.
 *
 * The variable is asserted to be 4-byte aligned.  On ARM this is either the
 * __load_acquire32 intrinsic or a barrier (RTASM_ARM_DMB_SY) followed by a
 * word load; on other targets it is ASMMemoryFence followed by a plain
 * volatile read (which asserts in 16-bit builds, see the todo).
 *
 * @returns Current *pu32 value
 * @param   pu32    Pointer to the 32-bit variable to read.
 */
DECLINLINE(uint32_t) ASMAtomicReadU32(volatile uint32_t RT_FAR *pu32) RT_NOTHROW_DEF
{
    Assert(!((uintptr_t)pu32 & 3));
#if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
# if RT_INLINE_ASM_USES_INTRIN
    return (uint32_t)__load_acquire32(pu32);

# else
    /** @todo check out using ldar (like __load_acquire32). */
    uint32_t u32;
#  if defined(RTASM_ARM64_USE_FEAT_LSE) && 0 /* very expensive on M1, but alignment advantages with LEA2 (M2?). */
    __asm__ __volatile__("Lstart_ASMAtomicReadU32_%=:\n\t"
                         RTASM_ARM_DMB_SY
                         "casa %w[uDst], wzr, %[pMem]\n\t"
                         : [uDst] "=&r" (u32)
                         : [pMem] "Q" (*pu32),
                           "0" (0)
                           RTASM_ARM_DMB_SY_COMMA_IN_REG);
#  else
    __asm__ __volatile__("Lstart_ASMAtomicReadU32_%=:\n\t"
                         RTASM_ARM_DMB_SY
#   if defined(RT_ARCH_ARM64)
#    if 1 /* ASSUMING proper barrier and aligned access, we should be fine with single-copy atomicity, just like on x86. */
                         "ldur %w[uDst], %[pMem]\n\t"
#    else
                         "ldxr %w[uDst], %[pMem]\n\t"
                         "clrex\n\t"
#    endif
#   else
                         "ldrex %[uDst], %[pMem]\n\t"
                         /** @todo clrex */
#   endif
                         : [uDst] "=&r" (u32)
                         : [pMem] "Q" (*pu32)
                           RTASM_ARM_DMB_SY_COMMA_IN_REG);
#  endif
    return u32;
# endif

#else
    ASMMemoryFence();
# if ARCH_BITS == 16
    AssertFailed();  /** @todo 16-bit */
# endif
    return *pu32;
#endif
}
3429
3430
/**
 * Atomically reads an unsigned 32-bit value, unordered.
 *
 * The variable is asserted to be 4-byte aligned.  No fence/barrier is issued
 * in front of the load.  The non-ARM path asserts in 16-bit builds (see the
 * todo).
 *
 * @returns Current *pu32 value
 * @param   pu32    Pointer to the 32-bit variable to read.
 */
DECLINLINE(uint32_t) ASMAtomicUoReadU32(volatile uint32_t RT_FAR *pu32) RT_NOTHROW_DEF
{
    Assert(!((uintptr_t)pu32 & 3));
#if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
# if RT_INLINE_ASM_USES_INTRIN
    return (uint32_t)__iso_volatile_load32((volatile int32_t *)pu32);

# else
    uint32_t u32;
    __asm__ __volatile__("Lstart_ASMAtomicUoReadU32_%=:\n\t"
#  if defined(RT_ARCH_ARM64)
                         "ldur %w[uDst], %[pMem]\n\t"
#  else
                         "ldrex %[uDst], %[pMem]\n\t" /** @todo fix this */
#  endif
                         : [uDst] "=&r" (u32)
                         : [pMem] "Q" (*pu32));
    return u32;
# endif

#else
# if ARCH_BITS == 16
    AssertFailed();  /** @todo 16-bit */
# endif
    return *pu32;
#endif
}
3464
3465
3466/**
3467 * Atomically reads a signed 32-bit value, ordered.
3468 *
3469 * @returns Current *pi32 value
3470 * @param pi32 Pointer to the 32-bit variable to read.
3471 */
3472DECLINLINE(int32_t) ASMAtomicReadS32(volatile int32_t RT_FAR *pi32) RT_NOTHROW_DEF
3473{
3474 Assert(!((uintptr_t)pi32 & 3));
3475#if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
3476 return (int32_t)ASMAtomicReadU32((volatile uint32_t RT_FAR *)pi32);
3477#else
3478 ASMMemoryFence();
3479# if ARCH_BITS == 16
3480 AssertFailed(); /** @todo 16-bit */
3481# endif
3482 return *pi32;
3483#endif
3484}
3485
3486
/**
 * Atomically reads a signed 32-bit value, unordered.
 *
 * The variable is asserted to be 4-byte aligned.  No fence/barrier is issued
 * in front of the load.  The non-ARM path asserts in 16-bit builds (see the
 * todo).
 *
 * @returns Current *pi32 value
 * @param   pi32    Pointer to the 32-bit variable to read.
 */
DECLINLINE(int32_t) ASMAtomicUoReadS32(volatile int32_t RT_FAR *pi32) RT_NOTHROW_DEF
{
    Assert(!((uintptr_t)pi32 & 3));
#if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
# if RT_INLINE_ASM_USES_INTRIN
    return __iso_volatile_load32(pi32);

# else
    int32_t i32;
    __asm__ __volatile__("Lstart_ASMAtomicUoReadS32_%=:\n\t"
#  if defined(RT_ARCH_ARM64)
                         "ldur %w[iDst], %[pMem]\n\t"
#  else
                         "ldrex %[iDst], %[pMem]\n\t" /** @todo fix this */
#  endif
                         : [iDst] "=&r" (i32)
                         : [pMem] "Q" (*pi32));
    return i32;
# endif

#else
# if ARCH_BITS == 16
    AssertFailed();  /** @todo 16-bit */
# endif
    return *pi32;
#endif
}
3520
3521
/**
 * Atomically reads an unsigned 64-bit value, ordered.
 *
 * Implementation by target:
 *      - AMD64: ASMMemoryFence followed by a plain volatile read.
 *      - x86: LOCK CMPXCHG8B with EDX:EAX and ECX:EBX all zero; the value ends
 *        up in EDX:EAX.  This is why the memory must be writable.
 *      - ARM: __load_acquire64 intrinsic, or a barrier (RTASM_ARM_DMB_SY)
 *        followed by a load.
 *
 * @returns Current *pu64 value
 * @param   pu64    Pointer to the 64-bit variable to read.
 *                  The memory pointed to must be writable.
 *
 * @remarks This may fault if the memory is read-only!
 * @remarks x86: Requires a Pentium or later.
 */
#if (RT_INLINE_ASM_EXTERNAL_TMP_ARM && !defined(RT_ARCH_AMD64)) \
 || RT_INLINE_DONT_MIX_CMPXCHG8B_AND_PIC
RT_ASM_DECL_PRAGMA_WATCOM(uint64_t) ASMAtomicReadU64(volatile uint64_t RT_FAR *pu64) RT_NOTHROW_PROTO;
#else
DECLINLINE(uint64_t) ASMAtomicReadU64(volatile uint64_t RT_FAR *pu64) RT_NOTHROW_DEF
{
    uint64_t u64;
# ifdef RT_ARCH_AMD64
    Assert(!((uintptr_t)pu64 & 7));
/*#  if RT_INLINE_ASM_GNU_STYLE
    __asm__ __volatile__(  "mfence\n\t"
                           "movq %1, %0\n\t"
                         : "=r" (u64)
                         : "m" (*pu64));
#  else
    __asm
    {
        mfence
        mov     rdx, [pu64]
        mov     rax, [rdx]
        mov     [u64], rax
    }
#  endif*/
    ASMMemoryFence();
    u64 = *pu64;

# elif defined(RT_ARCH_X86)
#  if RT_INLINE_ASM_GNU_STYLE
#   if defined(PIC) || defined(__PIC__)
    /* PIC: EBX cannot be handed to the asm as an operand here, so it is
       swapped with a zeroed stack variable around the cmpxchg8b and restored
       afterwards. */
    uint32_t u32EBX = 0;
    Assert(!((uintptr_t)pu64 & 7));
    __asm__ __volatile__("xchgl %%ebx, %3\n\t"
                         "lock; cmpxchg8b (%5)\n\t"
                         "movl %3, %%ebx\n\t"
                         : "=A" (u64)
#    if RT_GNUC_PREREQ(4, 3)
                         , "+m" (*pu64)
#    else
                         , "=m" (*pu64)
#    endif
                         : "0" (0ULL)
                         , "m" (u32EBX)
                         , "c" (0)
                         , "S" (pu64)
                         : "cc");
#   else /* !PIC */
    __asm__ __volatile__("lock; cmpxchg8b %1\n\t"
                         : "=A" (u64)
                         , "+m" (*pu64)
                         : "0" (0ULL)
                         , "b" (0)
                         , "c" (0)
                         : "cc");
#   endif
#  else
    Assert(!((uintptr_t)pu64 & 7));
    /* MSC-style inline assembly: zero EDX:EAX and ECX:EBX, cmpxchg8b, then
       copy EDX:EAX into u64. */
    __asm
    {
        xor     eax, eax
        xor     edx, edx
        mov     edi, pu64
        xor     ecx, ecx
        xor     ebx, ebx
        lock cmpxchg8b [edi]
        mov     dword ptr [u64], eax
        mov     dword ptr [u64 + 4], edx
    }
#  endif

# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
    Assert(!((uintptr_t)pu64 & 7));

#  if RT_INLINE_ASM_USES_INTRIN
    u64 = (uint64_t)__load_acquire64(pu64);

#  else
    /** @todo check out ldar (like __load_acquire64)  */
#   if defined(RTASM_ARM64_USE_FEAT_LSE) && 0 /* very expensive on M1, but alignment advantages with LEA2 (M2?). */
    __asm__ __volatile__("Lstart_ASMAtomicReadU64_%=:\n\t"
                         RTASM_ARM_DMB_SY
                         "casa %[uDst], xzr, %[pMem]\n\t"
                         : [uDst] "=&r" (u64)
                         : [pMem] "Q" (*pu64),
                           "0" (0)
                           RTASM_ARM_DMB_SY_COMMA_IN_REG);
#   else
    __asm__ __volatile__("Lstart_ASMAtomicReadU64_%=:\n\t"
                         RTASM_ARM_DMB_SY
#    if defined(RT_ARCH_ARM64)
#     if 1 /* ASSUMING proper barrier and aligned access, we should be fine with single-copy atomicity, just like on x86. */
                         "ldur %[uDst], %[pMem]\n\t"
#     else
                         "ldxr %[uDst], %[pMem]\n\t"
                         "clrex\n\t"
#     endif
#    else
                         "ldrexd %[uDst], %H[uDst], %[pMem]\n\t"
                         /** @todo clrex */
#    endif
                         : [uDst] "=&r" (u64)
                         : [pMem] "Q" (*pu64)
                           RTASM_ARM_DMB_SY_COMMA_IN_REG);
#   endif
#  endif

# else
#  error "Port me"
# endif
    return u64;
}
#endif
3643
3644
/**
 * Atomically reads an unsigned 64-bit value, unordered.
 *
 * Implementation by target:
 *      - AMD64: plain volatile read.
 *      - x86: LOCK CMPXCHG8B with EDX:EAX and ECX:EBX all zero; the value ends
 *        up in EDX:EAX.  This is why the memory must be writable.
 *      - ARM: __iso_volatile_load64 intrinsic, or a load without any barrier.
 *
 * @returns Current *pu64 value
 * @param   pu64    Pointer to the 64-bit variable to read.
 *                  The memory pointed to must be writable.
 *
 * @remarks This may fault if the memory is read-only!
 * @remarks x86: Requires a Pentium or later.
 */
#if !defined(RT_ARCH_AMD64) \
  && (   (RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN) \
      || RT_INLINE_DONT_MIX_CMPXCHG8B_AND_PIC)
RT_ASM_DECL_PRAGMA_WATCOM(uint64_t) ASMAtomicUoReadU64(volatile uint64_t RT_FAR *pu64) RT_NOTHROW_PROTO;
#else
DECLINLINE(uint64_t) ASMAtomicUoReadU64(volatile uint64_t RT_FAR *pu64) RT_NOTHROW_DEF
{
    uint64_t u64;
# ifdef RT_ARCH_AMD64
    Assert(!((uintptr_t)pu64 & 7));
/*#  if RT_INLINE_ASM_GNU_STYLE
    Assert(!((uintptr_t)pu64 & 7));
    __asm__ __volatile__("movq %1, %0\n\t"
                         : "=r" (u64)
                         : "m" (*pu64));
#  else
    __asm
    {
        mov     rdx, [pu64]
        mov     rax, [rdx]
        mov     [u64], rax
    }
#  endif */
    u64 = *pu64;

# elif defined(RT_ARCH_X86)
#  if RT_INLINE_ASM_GNU_STYLE
#   if defined(PIC) || defined(__PIC__)
    /* PIC: the registers are zeroed inside the asm and EBX is swapped with a
       zeroed stack variable around the cmpxchg8b; u32Spill only exists to
       tell the compiler that ECX gets overwritten. */
    uint32_t u32EBX = 0;
    uint32_t u32Spill;
    Assert(!((uintptr_t)pu64 & 7));
    __asm__ __volatile__("xor %%eax,%%eax\n\t"
                         "xor %%ecx,%%ecx\n\t"
                         "xor %%edx,%%edx\n\t"
                         "xchgl %%ebx, %3\n\t"
                         "lock; cmpxchg8b (%4)\n\t"
                         "movl %3, %%ebx\n\t"
                         : "=A" (u64)
#    if RT_GNUC_PREREQ(4, 3)
                         , "+m" (*pu64)
#    else
                         , "=m" (*pu64)
#    endif
                         , "=c" (u32Spill)
                         : "m" (u32EBX)
                         , "S" (pu64)
                         : "cc");
#   else /* !PIC */
    __asm__ __volatile__("lock; cmpxchg8b %1\n\t"
                         : "=A" (u64)
                         , "+m" (*pu64)
                         : "0" (0ULL)
                         , "b" (0)
                         , "c" (0)
                         : "cc");
#   endif
#  else
    Assert(!((uintptr_t)pu64 & 7));
    /* MSC-style inline assembly: zero EDX:EAX and ECX:EBX, cmpxchg8b, then
       copy EDX:EAX into u64. */
    __asm
    {
        xor     eax, eax
        xor     edx, edx
        mov     edi, pu64
        xor     ecx, ecx
        xor     ebx, ebx
        lock cmpxchg8b [edi]
        mov     dword ptr [u64], eax
        mov     dword ptr [u64 + 4], edx
    }
#  endif

# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
    Assert(!((uintptr_t)pu64 & 7));
#  if RT_INLINE_ASM_USES_INTRIN
    u64 = (uint64_t)__iso_volatile_load64((volatile int64_t *)pu64);

#  else
    __asm__ __volatile__("Lstart_ASMAtomicUoReadU64_%=:\n\t"
#   if defined(RT_ARCH_ARM64)
                         "ldur %[uDst], %[pMem]\n\t"
#   else
                         "ldrexd %[uDst], %H[uDst], %[pMem]\n\t" /* this is required for atomic access since it's a pair */
                         /** @todo clrex? */
#   endif
                         : [uDst] "=&r" (u64)
                         : [pMem] "Q" (*pu64));
#  endif

# else
#  error "Port me"
# endif
    return u64;
}
#endif
3749
3750
3751/**
3752 * Atomically reads a signed 64-bit value, ordered.
3753 *
3754 * @returns Current *pi64 value
3755 * @param pi64 Pointer to the 64-bit variable to read.
3756 * The memory pointed to must be writable.
3757 *
3758 * @remarks This may fault if the memory is read-only!
3759 * @remarks x86: Requires a Pentium or later.
3760 */
3761DECLINLINE(int64_t) ASMAtomicReadS64(volatile int64_t RT_FAR *pi64) RT_NOTHROW_DEF
3762{
3763 return (int64_t)ASMAtomicReadU64((volatile uint64_t RT_FAR *)pi64);
3764}
3765
3766
3767/**
3768 * Atomically reads a signed 64-bit value, unordered.
3769 *
3770 * @returns Current *pi64 value
3771 * @param pi64 Pointer to the 64-bit variable to read.
3772 * The memory pointed to must be writable.
3773 *
3774 * @remarks This will fault if the memory is read-only!
3775 * @remarks x86: Requires a Pentium or later.
3776 */
3777DECLINLINE(int64_t) ASMAtomicUoReadS64(volatile int64_t RT_FAR *pi64) RT_NOTHROW_DEF
3778{
3779 return (int64_t)ASMAtomicUoReadU64((volatile uint64_t RT_FAR *)pi64);
3780}
3781
3782
/** @def RTASM_HAVE_READ_U128
 * Defined if the target architecture supports atomic reading of 128-bit
 * integers.
 *
 * The define value is zero if both ordered and unordered reads are implemented
 * using ASMAtomicCmpXchgU128v2().  It is 1 if unordered reads are done natively
 * w/o cmpxchg and 3 if both variants are done natively w/o cmpxchg.
 *
 * @note    AMD64: Caller must check for cmpxchg16b support before use and make
 *          sure variables are writable (won't be changed).
 * @sa      RTASM_HAVE_CMP_XCHG_U128, RTASM_HAVE_WRITE_U128
 */
#if defined(RT_ARCH_ARM64) || defined(DOXYGEN_RUNNING)
# define RTASM_HAVE_READ_U128 3 /* ARM64 (and doxygen runs): both variants native. */
#elif defined(RTASM_HAVE_CMP_XCHG_U128)
# define RTASM_HAVE_READ_U128 0 /* Both variants go via ASMAtomicCmpXchgU128v2. */
#endif
3800
3801#ifdef RTASM_HAVE_READ_U128
3802
/**
 * Atomically reads an unsigned 128-bit value, ordered.
 *
 * With GCC-style compilers on ARM64 this is a load-pair instruction with a
 * barrier (RTASM_ARM_DMB_SY) before and after it.  Otherwise the value is
 * fetched via ASMAtomicCmpXchgU128v2 with all-zero new and old values.
 *
 * @returns Current *pu128 value
 * @param   pu128   Pointer to the 128-bit variable to read.
 *                  The memory pointed to must be writable.
 *
 * @remarks AMD64: Requires the memory to be both readable and writable.
 * @remarks AMD64: Requires support for cmpxchg16b.
 */
DECLINLINE(uint128_t) ASMAtomicReadU128(volatile uint128_t RT_FAR *pu128) RT_NOTHROW_DEF
{
    RTUINT128U u128Ret;
    Assert(!((uintptr_t)pu128 & 15)); /* must be 16-byte aligned */
# if defined(__GNUC__) && defined(RT_ARCH_ARM64)
    __asm__ __volatile__("Lstart_ASMAtomicReadU128_%=:\n\t"
                         RTASM_ARM_DMB_SY
                         "ldp %[uRetLo], %[uRetHi], %[pMem]\n\t"
                         RTASM_ARM_DMB_SY
                         : [uRetHi] "=r" (u128Ret.s.Hi)
                         , [uRetLo] "=r" (u128Ret.s.Lo)
                         : [pMem] "Q" (*pu128)
                         : );
    return u128Ret.u;
# else
    /* The return value of the compare-exchange is not looked at; only the
       old value stored in u128Ret is of interest. */
    ASMAtomicCmpXchgU128v2(pu128, 0, 0, 0, 0, &u128Ret.u);
    return u128Ret.u;
# endif
}
3832
/**
 * Atomically reads an unsigned 128-bit value, ordered, RTUINT128U variant.
 *
 * With GCC-style compilers on ARM64 this is a load-pair instruction with a
 * barrier (RTASM_ARM_DMB_SY) before and after it.  Otherwise the value is
 * fetched via ASMAtomicCmpXchgU128v2 with all-zero new and old values.
 *
 * @returns Current *pu128 value
 * @param   pu128   Pointer to the 128-bit variable to read.
 *                  The memory pointed to must be writable.
 *
 * @remarks AMD64: Requires the memory to be both readable and writable.
 * @remarks AMD64: Requires support for cmpxchg16b.
 */
DECLINLINE(RTUINT128U) ASMAtomicReadU128U(volatile RTUINT128U RT_FAR *pu128) RT_NOTHROW_DEF
{
    RTUINT128U u128Ret;
    Assert(!((uintptr_t)pu128 & 15)); /* must be 16-byte aligned */
# if defined(__GNUC__) && defined(RT_ARCH_ARM64)
    __asm__ __volatile__("Lstart_ASMAtomicReadU128U_%=:\n\t"
                         RTASM_ARM_DMB_SY
                         "ldp %[uRetLo], %[uRetHi], %[pMem]\n\t"
                         RTASM_ARM_DMB_SY
                         : [uRetHi] "=r" (u128Ret.s.Hi)
                         , [uRetLo] "=r" (u128Ret.s.Lo)
                         : [pMem] "Q" (*pu128)
                         : );
    return u128Ret;
# else
    /* The return value of the compare-exchange is not looked at; only the
       old value stored in u128Ret is of interest. */
    ASMAtomicCmpXchgU128v2(&pu128->u, 0, 0, 0, 0, &u128Ret.u);
    return u128Ret;
# endif
}
3862
3863
/**
 * Atomically reads an unsigned 128-bit value, unordered.
 *
 * With GCC-style compilers on ARM64 this is a single load-pair instruction
 * without barriers.  Everywhere else it forwards to the ordered
 * ASMAtomicReadU128 (the SSE based AMD64 variant is disabled).
 *
 * @returns Current *pu128 value
 * @param   pu128   Pointer to the 128-bit variable to read.
 *                  The memory pointed to must be writable.
 *
 * @remarks AMD64: Requires the memory to be both readable and writable.
 * @remarks AMD64: Requires support for cmpxchg16b.
 * @remarks AMD64: Is ordered.
 */
DECLINLINE(uint128_t) ASMAtomicUoReadU128(volatile uint128_t RT_FAR *pu128) RT_NOTHROW_DEF
{
    Assert(!((uintptr_t)pu128 & 15)); /* must be 16-byte aligned */
# if defined(__GNUC__) && defined(RT_ARCH_ARM64)
    RTUINT128U u128Ret;
    __asm__ __volatile__("Lstart_ASMAtomicUoReadU128_%=:\n\t"
                         "ldp %[uRetLo], %[uRetHi], %[pMem]\n\t"
                         : [uRetHi] "=r" (u128Ret.s.Hi)
                         , [uRetLo] "=r" (u128Ret.s.Lo)
                         : [pMem] "Q" (*pu128)
                         : );
    return u128Ret.u;

# elif defined(RT_ARCH_AMD64) && 0
    /* This doesn't work because __m128i can't be made volatile and we're not
       able to force MSC (2019) to emit _mm_load_si128 (besides it emits movdqu
       instead of movdqa). */
    __m128i     uTmpSse   = _mm_load_si128((__m128i volatile *)pu128);
    __m128i     uTmpSseHi = _mm_srli_si128(uTmpSse, 64 / 8);
    RTUINT128U  u128Ret;
    u128Ret.s.Lo = (uint64_t)_mm_cvtsi128_si64(uTmpSse);
    u128Ret.s.Hi = (uint64_t)_mm_cvtsi128_si64(uTmpSseHi);
    return u128Ret.u;

# else
    /* No native unordered read, use the ordered one. */
    return ASMAtomicReadU128(pu128);
# endif
}
3903
/**
 * Atomically reads an unsigned 128-bit value, unordered, RTUINT128U variant.
 *
 * With GCC-style compilers on ARM64 this is a single load-pair instruction
 * without barriers.  Everywhere else it forwards to the ordered
 * ASMAtomicReadU128U.
 *
 * @returns Current *pu128 value
 * @param   pu128   Pointer to the 128-bit variable to read.
 *                  The memory pointed to must be writable.
 *
 * @remarks AMD64: Requires the memory to be both readable and writable.
 * @remarks AMD64: Requires support for cmpxchg16b.
 * @remarks AMD64: Is ordered.
 */
DECLINLINE(RTUINT128U) ASMAtomicUoReadU128U(volatile RTUINT128U RT_FAR *pu128) RT_NOTHROW_DEF
{
    Assert(!((uintptr_t)pu128 & 15)); /* must be 16-byte aligned */
# if defined(__GNUC__) && defined(RT_ARCH_ARM64)
    RTUINT128U u128Ret;
    __asm__ __volatile__("Lstart_ASMAtomicUoReadU128U_%=:\n\t"
                         "ldp %[uRetLo], %[uRetHi], %[pMem]\n\t"
                         : [uRetHi] "=r" (u128Ret.s.Hi)
                         , [uRetLo] "=r" (u128Ret.s.Lo)
                         : [pMem] "Q" (*pu128)
                         : );
    return u128Ret;
# else
    /* No native unordered read, use the ordered one. */
    return ASMAtomicReadU128U(pu128);
# endif
}
3931
3932#endif /* RTASM_HAVE_READ_U128 */
3933
3934/**
3935 * Atomically reads a size_t value, ordered.
3936 *
3937 * @returns Current *pcb value
3938 * @param pcb Pointer to the size_t variable to read.
3939 */
3940DECLINLINE(size_t) ASMAtomicReadZ(size_t volatile RT_FAR *pcb) RT_NOTHROW_DEF
3941{
3942#if ARCH_BITS == 64
3943 return ASMAtomicReadU64((uint64_t volatile RT_FAR *)pcb);
3944#elif ARCH_BITS == 32
3945 return ASMAtomicReadU32((uint32_t volatile RT_FAR *)pcb);
3946#elif ARCH_BITS == 16
3947 AssertCompileSize(size_t, 2);
3948 return ASMAtomicReadU16((uint16_t volatile RT_FAR *)pcb);
3949#else
3950# error "Unsupported ARCH_BITS value"
3951#endif
3952}
3953
3954
3955/**
3956 * Atomically reads a size_t value, unordered.
3957 *
3958 * @returns Current *pcb value
3959 * @param pcb Pointer to the size_t variable to read.
3960 */
3961DECLINLINE(size_t) ASMAtomicUoReadZ(size_t volatile RT_FAR *pcb) RT_NOTHROW_DEF
3962{
3963#if ARCH_BITS == 64 || ARCH_BITS == 16
3964 return ASMAtomicUoReadU64((uint64_t volatile RT_FAR *)pcb);
3965#elif ARCH_BITS == 32
3966 return ASMAtomicUoReadU32((uint32_t volatile RT_FAR *)pcb);
3967#elif ARCH_BITS == 16
3968 AssertCompileSize(size_t, 2);
3969 return ASMAtomicUoReadU16((uint16_t volatile RT_FAR *)pcb);
3970#else
3971# error "Unsupported ARCH_BITS value"
3972#endif
3973}
3974
3975
3976/**
3977 * Atomically reads a pointer value, ordered.
3978 *
3979 * @returns Current *pv value
3980 * @param ppv Pointer to the pointer variable to read.
3981 *
3982 * @remarks Please use ASMAtomicReadPtrT, it provides better type safety and
3983 * requires less typing (no casts).
3984 */
3985DECLINLINE(void RT_FAR *) ASMAtomicReadPtr(void RT_FAR * volatile RT_FAR *ppv) RT_NOTHROW_DEF
3986{
3987#if ARCH_BITS == 32 || ARCH_BITS == 16
3988 return (void RT_FAR *)ASMAtomicReadU32((volatile uint32_t RT_FAR *)(void RT_FAR *)ppv);
3989#elif ARCH_BITS == 64
3990 return (void RT_FAR *)ASMAtomicReadU64((volatile uint64_t RT_FAR *)(void RT_FAR *)ppv);
3991#else
3992# error "ARCH_BITS is bogus"
3993#endif
3994}
3995
/**
 * Convenience macro for avoiding the annoying casting with ASMAtomicReadPtr.
 *
 * The GNU C variant additionally type checks @a ppv by way of a typed
 * temporary; the generic variant is a plain cast.  Both variants expand to a
 * single parenthesised expression so the result can be used directly with
 * postfix operators.
 *
 * @returns Current *ppv value, as @a Type.
 * @param   ppv     Pointer to the pointer variable to read.
 * @param   Type    The type of *ppv, sans volatile.
 */
#ifdef __GNUC__ /* 8.2.0 requires -Wno-ignored-qualifiers */
# define ASMAtomicReadPtrT(ppv, Type) \
    __extension__ \
    ({\
        __typeof__(*(ppv)) volatile *ppvTypeChecked = (ppv); \
        Type pvTypeChecked = (__typeof__(*(ppv))) ASMAtomicReadPtr((void * volatile *)ppvTypeChecked); \
        pvTypeChecked; \
     })
#else
# define ASMAtomicReadPtrT(ppv, Type) \
    ((Type)ASMAtomicReadPtr((void RT_FAR * volatile RT_FAR *)(ppv)))
#endif
4015
4016
4017/**
4018 * Atomically reads a pointer value, unordered.
4019 *
4020 * @returns Current *pv value
4021 * @param ppv Pointer to the pointer variable to read.
4022 *
4023 * @remarks Please use ASMAtomicUoReadPtrT, it provides better type safety and
4024 * requires less typing (no casts).
4025 */
4026DECLINLINE(void RT_FAR *) ASMAtomicUoReadPtr(void RT_FAR * volatile RT_FAR *ppv) RT_NOTHROW_DEF
4027{
4028#if ARCH_BITS == 32 || ARCH_BITS == 16
4029 return (void RT_FAR *)ASMAtomicUoReadU32((volatile uint32_t RT_FAR *)(void RT_FAR *)ppv);
4030#elif ARCH_BITS == 64
4031 return (void RT_FAR *)ASMAtomicUoReadU64((volatile uint64_t RT_FAR *)(void RT_FAR *)ppv);
4032#else
4033# error "ARCH_BITS is bogus"
4034#endif
4035}
4036
4037
/**
 * Convenience macro for avoiding the annoying casting with ASMAtomicUoReadPtr.
 *
 * The GNU C variant additionally type checks @a ppv by way of a typed
 * temporary; the generic variant is a plain cast.  Both variants expand to a
 * single parenthesised expression so the result can be used directly with
 * postfix operators.
 *
 * @returns Current *ppv value, as @a Type.
 * @param   ppv     Pointer to the pointer variable to read.
 * @param   Type    The type of *ppv, sans volatile.
 */
#ifdef __GNUC__ /* 8.2.0 requires -Wno-ignored-qualifiers */
# define ASMAtomicUoReadPtrT(ppv, Type) \
    __extension__ \
    ({\
        __typeof__(*(ppv)) volatile * const ppvTypeChecked = (ppv); \
        Type pvTypeChecked = (__typeof__(*(ppv))) ASMAtomicUoReadPtr((void * volatile *)ppvTypeChecked); \
        pvTypeChecked; \
     })
#else
# define ASMAtomicUoReadPtrT(ppv, Type) \
    ((Type)ASMAtomicUoReadPtr((void RT_FAR * volatile RT_FAR *)(ppv)))
#endif
4057
4058
4059/**
4060 * Atomically reads a boolean value, ordered.
4061 *
4062 * @returns Current *pf value
4063 * @param pf Pointer to the boolean variable to read.
4064 */
4065DECLINLINE(bool) ASMAtomicReadBool(volatile bool RT_FAR *pf) RT_NOTHROW_DEF
4066{
4067 ASMMemoryFence();
4068 return *pf; /* byte reads are atomic on x86 */
4069}
4070
4071
4072/**
4073 * Atomically reads a boolean value, unordered.
4074 *
4075 * @returns Current *pf value
4076 * @param pf Pointer to the boolean variable to read.
4077 */
4078DECLINLINE(bool) ASMAtomicUoReadBool(volatile bool RT_FAR *pf) RT_NOTHROW_DEF
4079{
4080 return *pf; /* byte reads are atomic on x86 */
4081}
4082
4083
4084/**
4085 * Atomically read a typical IPRT handle value, ordered.
4086 *
4087 * @param ph Pointer to the handle variable to read.
4088 * @param phRes Where to store the result.
4089 *
4090 * @remarks This doesn't currently work for all handles (like RTFILE).
4091 */
4092#if HC_ARCH_BITS == 32 || ARCH_BITS == 16
4093# define ASMAtomicReadHandle(ph, phRes) \
4094 do { \
4095 AssertCompile(sizeof(*(ph)) == sizeof(uint32_t)); \
4096 AssertCompile(sizeof(*(phRes)) == sizeof(uint32_t)); \
4097 *(uint32_t RT_FAR *)(phRes) = ASMAtomicReadU32((uint32_t volatile RT_FAR *)(ph)); \
4098 } while (0)
4099#elif HC_ARCH_BITS == 64
4100# define ASMAtomicReadHandle(ph, phRes) \
4101 do { \
4102 AssertCompile(sizeof(*(ph)) == sizeof(uint64_t)); \
4103 AssertCompile(sizeof(*(phRes)) == sizeof(uint64_t)); \
4104 *(uint64_t RT_FAR *)(phRes) = ASMAtomicReadU64((uint64_t volatile RT_FAR *)(ph)); \
4105 } while (0)
4106#else
4107# error HC_ARCH_BITS
4108#endif
4109
4110
4111/**
4112 * Atomically read a typical IPRT handle value, unordered.
4113 *
4114 * @param ph Pointer to the handle variable to read.
4115 * @param phRes Where to store the result.
4116 *
4117 * @remarks This doesn't currently work for all handles (like RTFILE).
4118 */
4119#if HC_ARCH_BITS == 32 || ARCH_BITS == 16
4120# define ASMAtomicUoReadHandle(ph, phRes) \
4121 do { \
4122 AssertCompile(sizeof(*(ph)) == sizeof(uint32_t)); \
4123 AssertCompile(sizeof(*(phRes)) == sizeof(uint32_t)); \
4124 *(uint32_t RT_FAR *)(phRes) = ASMAtomicUoReadU32((uint32_t volatile RT_FAR *)(ph)); \
4125 } while (0)
4126#elif HC_ARCH_BITS == 64
4127# define ASMAtomicUoReadHandle(ph, phRes) \
4128 do { \
4129 AssertCompile(sizeof(*(ph)) == sizeof(uint64_t)); \
4130 AssertCompile(sizeof(*(phRes)) == sizeof(uint64_t)); \
4131 *(uint64_t RT_FAR *)(phRes) = ASMAtomicUoReadU64((uint64_t volatile RT_FAR *)(ph)); \
4132 } while (0)
4133#else
4134# error HC_ARCH_BITS
4135#endif
4136
4137
/**
 * Atomically read a value which size might differ
 * between platforms or compilers, ordered.
 *
 * Dispatches on sizeof(*pu) to the 8, 16, 32 or 64-bit ordered reader.  Any
 * other size triggers an assertion and leaves *puRes untouched.
 *
 * @param   pu      Pointer to the variable to read.
 * @param   puRes   Where to store the result.
 */
#define ASMAtomicReadSize(pu, puRes) \
    do { \
        switch (sizeof(*(pu))) { \
            case 1: *(uint8_t  RT_FAR *)(puRes) = ASMAtomicReadU8( (volatile uint8_t  RT_FAR *)(void RT_FAR *)(pu)); break; \
            case 2: *(uint16_t RT_FAR *)(puRes) = ASMAtomicReadU16((volatile uint16_t RT_FAR *)(void RT_FAR *)(pu)); break; \
            case 4: *(uint32_t RT_FAR *)(puRes) = ASMAtomicReadU32((volatile uint32_t RT_FAR *)(void RT_FAR *)(pu)); break; \
            case 8: *(uint64_t RT_FAR *)(puRes) = ASMAtomicReadU64((volatile uint64_t RT_FAR *)(void RT_FAR *)(pu)); break; \
            /* The cast makes the argument match the %d conversion (sizeof yields size_t). */ \
            default: AssertMsgFailed(("ASMAtomicReadSize: size %d is not supported\n", (int)sizeof(*(pu)))); \
        } \
    } while (0)
4155
4156
/**
 * Atomically read a value which size might differ
 * between platforms or compilers, unordered.
 *
 * Dispatches on sizeof(*pu) to the 8, 16, 32 or 64-bit unordered reader.  Any
 * other size triggers an assertion and leaves *puRes untouched.
 *
 * @param   pu      Pointer to the variable to read.
 * @param   puRes   Where to store the result.
 */
#define ASMAtomicUoReadSize(pu, puRes) \
    do { \
        switch (sizeof(*(pu))) { \
            case 1: *(uint8_t  RT_FAR *)(puRes) = ASMAtomicUoReadU8( (volatile uint8_t  RT_FAR *)(void RT_FAR *)(pu)); break; \
            case 2: *(uint16_t RT_FAR *)(puRes) = ASMAtomicUoReadU16((volatile uint16_t RT_FAR *)(void RT_FAR *)(pu)); break; \
            case 4: *(uint32_t RT_FAR *)(puRes) = ASMAtomicUoReadU32((volatile uint32_t RT_FAR *)(void RT_FAR *)(pu)); break; \
            case 8: *(uint64_t RT_FAR *)(puRes) = ASMAtomicUoReadU64((volatile uint64_t RT_FAR *)(void RT_FAR *)(pu)); break; \
            /* The cast makes the argument match the %d conversion (sizeof yields size_t). */ \
            default: AssertMsgFailed(("ASMAtomicUoReadSize: size %d is not supported\n", (int)sizeof(*(pu)))); \
        } \
    } while (0)
4174
4175
4176/**
4177 * Atomically writes an unsigned 8-bit value, ordered.
4178 *
4179 * @param pu8 Pointer to the 8-bit variable.
4180 * @param u8 The 8-bit value to assign to *pu8.
4181 */
4182DECLINLINE(void) ASMAtomicWriteU8(volatile uint8_t RT_FAR *pu8, uint8_t u8) RT_NOTHROW_DEF
4183{
4184#if defined(RT_ARCH_ARM64)
4185 /* The DMB SY will ensure ordering a la x86, the stlrb is probably overkill
4186 as all byte accesses are single-copy atomic, which I think suffices here. */
4187# if RT_INLINE_ASM_USES_INTRIN
4188 __dmb(_ARM64_BARRIER_SY);
4189 __stlr8(pu8, u8);
4190# else
4191 __asm__ __volatile__("Lstart_ASMAtomicWriteU8_%=:\n\t"
4192# if defined(RTASM_ARM64_USE_FEAT_LSE) && 0 /* this is a lot slower and has no alignment benefits with LSE2 */
4193 RTASM_ARM_DMB_SY
4194 "swpb %w[uValue], wzr, %[pMem]\n\t"
4195# else
4196 RTASM_ARM_DMB_SY
4197 "stlrb %w[uValue], %[pMem]\n\t" /* single-copy atomic w/ release semantics. */
4198# endif
4199 : [pMem] "+Q" (*pu8)
4200 : [uValue] "r" ((uint32_t)u8)
4201 : );
4202# endif
4203
4204#else
4205 ASMAtomicXchgU8(pu8, u8);
4206#endif
4207}
4208
4209
4210/**
4211 * Atomically writes an unsigned 8-bit value, unordered.
4212 *
4213 * @param pu8 Pointer to the 8-bit variable.
4214 * @param u8 The 8-bit value to assign to *pu8.
4215 */
4216DECLINLINE(void) ASMAtomicUoWriteU8(volatile uint8_t RT_FAR *pu8, uint8_t u8) RT_NOTHROW_DEF
4217{
4218 *pu8 = u8; /* byte writes are atomic on x86 */
4219}
4220
4221
4222/**
4223 * Atomically writes a signed 8-bit value, ordered.
4224 *
4225 * @param pi8 Pointer to the 8-bit variable to read.
4226 * @param i8 The 8-bit value to assign to *pi8.
4227 */
4228DECLINLINE(void) ASMAtomicWriteS8(volatile int8_t RT_FAR *pi8, int8_t i8) RT_NOTHROW_DEF
4229{
4230#if defined(RT_ARCH_ARM64)
4231 ASMAtomicWriteU8((volatile uint8_t RT_FAR *)pi8, (uint8_t)i8);
4232#else
4233 ASMAtomicXchgS8(pi8, i8);
4234#endif
4235}
4236
4237
4238/**
4239 * Atomically writes a signed 8-bit value, unordered.
4240 *
4241 * @param pi8 Pointer to the 8-bit variable to write.
4242 * @param i8 The 8-bit value to assign to *pi8.
4243 */
4244DECLINLINE(void) ASMAtomicUoWriteS8(volatile int8_t RT_FAR *pi8, int8_t i8) RT_NOTHROW_DEF
4245{
4246 *pi8 = i8; /* byte writes are atomic on x86 */
4247}
4248
4249
4250/**
4251 * Atomically writes an unsigned 16-bit value, ordered.
4252 *
4253 * @param pu16 Pointer to the 16-bit variable to write.
4254 * @param u16 The 16-bit value to assign to *pu16.
4255 */
4256DECLINLINE(void) ASMAtomicWriteU16(volatile uint16_t RT_FAR *pu16, uint16_t u16) RT_NOTHROW_DEF
4257{
4258#if defined(RT_ARCH_ARM64)
4259 /* See ASMAtomicWriteU8 comments. */
4260# if RT_INLINE_ASM_USES_INTRIN
4261 __dmb(_ARM64_BARRIER_SY);
4262 __stlr16(pu16, u16);
4263# else
4264 __asm__ __volatile__("Lstart_ASMAtomicWriteU16_%=:\n\t"
4265# if defined(RTASM_ARM64_USE_FEAT_LSE) /* slower on M1, but benefits from relaxed LSE2 alignment requirements (M2?). */
4266 RTASM_ARM_DMB_SY
4267 "swph %w[uValue], wzr, %[pMem]\n\t"
4268# else
4269 RTASM_ARM_DMB_SY
4270 "stlrh %w[uValue], %[pMem]\n\t" /* single-copy atomic w/ release semantics. */
4271# endif
4272 : [pMem] "+Q" (*pu16)
4273 : [uValue] "r" ((uint32_t)u16)
4274 : );
4275# endif
4276
4277#else
4278 ASMAtomicXchgU16(pu16, u16);
4279#endif
4280}
4281
4282
4283/**
4284 * Atomically writes an unsigned 16-bit value, unordered.
4285 *
4286 * @param pu16 Pointer to the 16-bit variable to write.
4287 * @param u16 The 16-bit value to assign to *pu16.
4288 */
4289DECLINLINE(void) ASMAtomicUoWriteU16(volatile uint16_t RT_FAR *pu16, uint16_t u16) RT_NOTHROW_DEF
4290{
4291 Assert(!((uintptr_t)pu16 & 1));
4292 *pu16 = u16;
4293}
4294
4295
4296/**
4297 * Atomically writes a signed 16-bit value, ordered.
4298 *
4299 * @param pi16 Pointer to the 16-bit variable to write.
4300 * @param i16 The 16-bit value to assign to *pi16.
4301 */
4302DECLINLINE(void) ASMAtomicWriteS16(volatile int16_t RT_FAR *pi16, int16_t i16) RT_NOTHROW_DEF
4303{
4304#if defined(RT_ARCH_ARM64)
4305 ASMAtomicWriteU16((volatile uint16_t RT_FAR *)pi16, (uint16_t)i16);
4306#else
4307 ASMAtomicXchgS16(pi16, i16);
4308#endif
4309}
4310
4311
4312/**
4313 * Atomically writes a signed 16-bit value, unordered.
4314 *
4315 * @param pi16 Pointer to the 16-bit variable to write.
4316 * @param i16 The 16-bit value to assign to *pi16.
4317 */
4318DECLINLINE(void) ASMAtomicUoWriteS16(volatile int16_t RT_FAR *pi16, int16_t i16) RT_NOTHROW_DEF
4319{
4320 Assert(!((uintptr_t)pi16 & 1));
4321 *pi16 = i16;
4322}
4323
4324
4325/**
4326 * Atomically writes an unsigned 32-bit value, ordered.
4327 *
4328 * @param pu32 Pointer to the 32-bit variable to write.
4329 * @param u32 The 32-bit value to assign to *pu32.
4330 */
4331DECLINLINE(void) ASMAtomicWriteU32(volatile uint32_t RT_FAR *pu32, uint32_t u32) RT_NOTHROW_DEF
4332{
4333#if defined(RT_ARCH_ARM64)
4334 /* See ASMAtomicWriteU8 comments. */
4335# if RT_INLINE_ASM_USES_INTRIN
4336 __dmb(_ARM64_BARRIER_SY);
4337 __stlr32(pu32, u32);
4338# else
4339 __asm__ __volatile__("Lstart_ASMAtomicWriteU32_%=:\n\t"
4340# if defined(RTASM_ARM64_USE_FEAT_LSE) /* slower on M1, but benefits from relaxed LSE2 alignment requirements (M2?). */
4341 RTASM_ARM_DMB_SY
4342 "swp %w[uValue], wzr, %[pMem]\n\t"
4343# else
4344 RTASM_ARM_DMB_SY
4345 "stlr %w[uValue], %[pMem]\n\t" /* single-copy atomic w/ release semantics. */
4346# endif
4347 : [pMem] "+Q" (*pu32)
4348 : [uValue] "r" (u32)
4349 : "cc");
4350# endif
4351
4352#else
4353 ASMAtomicXchgU32(pu32, u32);
4354#endif
4355}
4356
4357
4358/**
4359 * Atomically writes an unsigned 32-bit value, unordered.
4360 *
4361 * @param pu32 Pointer to the 32-bit variable to write.
4362 * @param u32 The 32-bit value to assign to *pu32.
4363 */
4364DECLINLINE(void) ASMAtomicUoWriteU32(volatile uint32_t RT_FAR *pu32, uint32_t u32) RT_NOTHROW_DEF
4365{
4366 Assert(!((uintptr_t)pu32 & 3));
4367#if ARCH_BITS >= 32
4368 *pu32 = u32;
4369#else
4370 ASMAtomicXchgU32(pu32, u32);
4371#endif
4372}
4373
4374
4375/**
4376 * Atomically writes a signed 32-bit value, ordered.
4377 *
4378 * @param pi32 Pointer to the 32-bit variable to write.
4379 * @param i32 The 32-bit value to assign to *pi32.
4380 */
4381DECLINLINE(void) ASMAtomicWriteS32(volatile int32_t RT_FAR *pi32, int32_t i32) RT_NOTHROW_DEF
4382{
4383#if defined(RT_ARCH_ARM64)
4384 ASMAtomicWriteU32((volatile uint32_t RT_FAR *)pi32, (uint32_t)i32);
4385#else
4386 ASMAtomicXchgS32(pi32, i32);
4387#endif
4388}
4389
4390
4391/**
4392 * Atomically writes a signed 32-bit value, unordered.
4393 *
4394 * @param pi32 Pointer to the 32-bit variable to write.
4395 * @param i32 The 32-bit value to assign to *pi32.
4396 */
4397DECLINLINE(void) ASMAtomicUoWriteS32(volatile int32_t RT_FAR *pi32, int32_t i32) RT_NOTHROW_DEF
4398{
4399 Assert(!((uintptr_t)pi32 & 3));
4400#if ARCH_BITS >= 32
4401 *pi32 = i32;
4402#else
4403 ASMAtomicXchgS32(pi32, i32);
4404#endif
4405}
4406
4407
4408/**
4409 * Atomically writes an unsigned 64-bit value, ordered.
4410 *
4411 * @param pu64 Pointer to the 64-bit variable to write.
4412 * @param u64 The 64-bit value to assign to *pu64.
4413 */
4414DECLINLINE(void) ASMAtomicWriteU64(volatile uint64_t RT_FAR *pu64, uint64_t u64) RT_NOTHROW_DEF
4415{
4416#if defined(RT_ARCH_ARM64)
4417 /* See ASMAtomicWriteU8 comments. */
4418# if RT_INLINE_ASM_USES_INTRIN
4419 __dmb(_ARM64_BARRIER_SY);
4420 __stlr64(pu64, u64);
4421# else
4422 __asm__ __volatile__("Lstart_ASMAtomicWriteU64_%=:\n\t"
4423# if defined(RTASM_ARM64_USE_FEAT_LSE) /* slower on M1, but benefits from relaxed LSE2 alignment requirements (M2?). */
4424 RTASM_ARM_DMB_SY
4425 "swp %[uValue], xzr, %[pMem]\n\t"
4426# else
4427 RTASM_ARM_DMB_SY /** @todo necessary? */
4428 "stlr %[uValue], %[pMem]\n\t"
4429# endif
4430 : [pMem] "+Q" (*pu64)
4431 : [uValue] "r" (u64)
4432 : );
4433# endif
4434
4435#else
4436 ASMAtomicXchgU64(pu64, u64);
4437#endif
4438}
4439
4440
4441/**
4442 * Atomically writes an unsigned 64-bit value, unordered.
4443 *
4444 * @param pu64 Pointer to the 64-bit variable to write.
4445 * @param u64 The 64-bit value to assign to *pu64.
4446 */
4447DECLINLINE(void) ASMAtomicUoWriteU64(volatile uint64_t RT_FAR *pu64, uint64_t u64) RT_NOTHROW_DEF
4448{
4449 Assert(!((uintptr_t)pu64 & 7));
4450#if ARCH_BITS == 64
4451 *pu64 = u64;
4452#else
4453 ASMAtomicXchgU64(pu64, u64);
4454#endif
4455}
4456
4457
4458/**
4459 * Atomically writes a signed 64-bit value, ordered.
4460 *
4461 * @param pi64 Pointer to the 64-bit variable to write.
4462 * @param i64 The 64-bit value to assign to *pi64.
4463 */
4464DECLINLINE(void) ASMAtomicWriteS64(volatile int64_t RT_FAR *pi64, int64_t i64) RT_NOTHROW_DEF
4465{
4466#if defined(RT_ARCH_ARM64)
4467 ASMAtomicWriteU64((volatile uint64_t RT_FAR *)pi64, (uint64_t)i64);
4468#else
4469 ASMAtomicXchgS64(pi64, i64);
4470#endif
4471}
4472
4473
4474/**
4475 * Atomically writes a signed 64-bit value, unordered.
4476 *
4477 * @param pi64 Pointer to the 64-bit variable to write.
4478 * @param i64 The 64-bit value to assign to *pi64.
4479 */
4480DECLINLINE(void) ASMAtomicUoWriteS64(volatile int64_t RT_FAR *pi64, int64_t i64) RT_NOTHROW_DEF
4481{
4482 Assert(!((uintptr_t)pi64 & 7));
4483#if ARCH_BITS == 64
4484 *pi64 = i64;
4485#else
4486 ASMAtomicXchgS64(pi64, i64);
4487#endif
4488}
4489
4490
/** @def RTASM_HAVE_WRITE_U128
 * Defined if the target architecture supports atomic writing of 128-bit
 * integers.
 *
 * The define value is zero if both ordered and unordered writes are implemented
 * using ASMAtomicCmpXchgU128v2().  It is 1 if unordered writes are done
 * natively w/o cmpxchg and 3 if both variants are done natively w/o cmpxchg.
 *
 * @note AMD64: Caller must check for cmpxchg16b support before use.
 * @sa   RTASM_HAVE_CMP_XCHG_U128
 */
#if defined(DOXYGEN_RUNNING) || defined(RT_ARCH_ARM64)
# define RTASM_HAVE_WRITE_U128 3
#elif defined(RTASM_HAVE_CMP_XCHG_U128)
# define RTASM_HAVE_WRITE_U128 0
#endif
4506
4507#ifdef RTASM_HAVE_WRITE_U128
4508
4509/**
4510 * Atomically writes an unsigned 128-bit value, ordered.
4511 *
4512 * @param pu128 Pointer to the variable to overwrite. Must be aligned
4513 * on 16 byte boundrary.
4514 * @param u64Hi The high 64 bits of the new value.
4515 * @param u64Lo The low 64 bits of the new value.
4516 */
4517DECLINLINE(void) ASMAtomicWriteU128v2(volatile uint128_t *pu128, const uint64_t u64Hi, const uint64_t u64Lo) RT_NOTHROW_DEF
4518{
4519# if !defined(__GNUC__) || !defined(RT_ARCH_ARM64)
4520 RTUINT128U u128Old;
4521# endif
4522 Assert(!((uintptr_t)pu128 & 15));
4523# if defined(__GNUC__) && defined(RT_ARCH_ARM64)
4524 __asm__ __volatile__("Lstart_ASMAtomicWriteU128v2_%=:\n\t"
4525# if 0 && defined(RTASM_ARM64_USE_FEAT_LSE128) /** @todo hw support? test + debug */
4526 RTASM_ARM_DMB_SY
4527 "swpp %[uValueLo], %[uValueHi], %[pMem]\n\t"
4528# else
4529 RTASM_ARM_DMB_SY
4530 "stp %[uValueLo], %[uValueHi], %[pMem]\n\t"
4531 "dmb sy\n\t"
4532# endif
4533 : [pMem] "+Q" (*pu128)
4534 : [uValueHi] "r" (u64Hi)
4535 , [uValueLo] "r" (u64Lo)
4536 : );
4537
4538# else
4539# ifdef RT_COMPILER_WITH_128BIT_INT_TYPES
4540 u128Old.u = *pu128;
4541# else
4542 u128Old.u.Lo = pu128->Lo;
4543 u128Old.u.Hi = pu128->Hi;
4544# endif
4545 while (!ASMAtomicCmpXchgU128v2(pu128, u64Hi, u64Lo, u128Old.s.Hi, u128Old.s.Lo, &u128Old.u))
4546 { }
4547# endif
4548}
4549
4550
4551/**
4552 * Atomically writes an unsigned 128-bit value, ordered.
4553 *
4554 * @param pu128 Pointer to the variable to overwrite. Must be aligned
4555 * on 16 byte boundrary.
4556 * @param u64Hi The high 64 bits of the new value.
4557 * @param u64Lo The low 64 bits of the new value.
4558 * @note This is ordered on AMD64.
4559 */
4560DECLINLINE(void) ASMAtomicUoWriteU128v2(volatile uint128_t *pu128, const uint64_t u64Hi, const uint64_t u64Lo) RT_NOTHROW_DEF
4561{
4562# if !defined(__GNUC__) || !defined(RT_ARCH_ARM64)
4563 RTUINT128U u128Old;
4564# endif
4565 Assert(!((uintptr_t)pu128 & 15));
4566# if defined(__GNUC__) && defined(RT_ARCH_ARM64)
4567 __asm__ __volatile__("Lstart_ASMAtomicUoWriteU128v2_%=:\n\t"
4568 "stp %[uValueLo], %[uValueHi], %[pMem]\n\t"
4569 : [pMem] "+Q" (*pu128)
4570 : [uValueHi] "r" (u64Hi)
4571 , [uValueLo] "r" (u64Lo)
4572 : );
4573
4574# else
4575# ifdef RT_COMPILER_WITH_128BIT_INT_TYPES
4576 u128Old.u = *pu128;
4577# else
4578 u128Old.u.Lo = pu128->Lo;
4579 u128Old.u.Hi = pu128->Hi;
4580# endif
4581 while (!ASMAtomicCmpXchgU128v2(pu128, u64Hi, u64Lo, u128Old.s.Hi, u128Old.s.Lo, &u128Old.u))
4582 { }
4583# endif
4584}
4585
4586
4587/**
4588 * Atomically writes an unsigned 128-bit value, ordered.
4589 *
4590 * @param pu128 Pointer to the variable to overwrite. Must be aligned
4591 * on 16 byte boundrary.
4592 * @param u128 The the new value.
4593 */
4594DECLINLINE(void) ASMAtomicWriteU128(volatile uint128_t *pu128, const uint128_t u128) RT_NOTHROW_DEF
4595{
4596# ifdef RT_COMPILER_WITH_128BIT_INT_TYPES
4597 ASMAtomicWriteU128v2(pu128, (uint64_t)(u128 >> 64), (uint64_t)u128);
4598# else
4599 ASMAtomicWriteU128v2(pu128, u128.Hi, u128.Lo);
4600# endif
4601}
4602
4603
4604/**
4605 * Atomically writes an unsigned 128-bit value, unordered.
4606 *
4607 * @param pu128 Pointer to the variable to overwrite. Must be aligned
4608 * on 16 byte boundrary.
4609 * @param u128 The the new value.
4610 * @note This is ordered on AMD64.
4611 */
4612DECLINLINE(void) ASMAtomicUoWriteU128(volatile uint128_t *pu128, const uint128_t u128) RT_NOTHROW_DEF
4613{
4614# ifdef RT_COMPILER_WITH_128BIT_INT_TYPES
4615 ASMAtomicUoWriteU128v2(pu128, (uint64_t)(u128 >> 64), (uint64_t)u128);
4616# else
4617 ASMAtomicUoWriteU128v2(pu128, u128.Hi, u128.Lo);
4618# endif
4619}
4620
4621
4622/**
4623 * Atomically writes an unsigned 128-bit value, ordered.
4624 *
4625 * @param pu128 Pointer to the variable to overwrite. Must be aligned
4626 * on 16 byte boundrary.
4627 * @param u128 The the new value.
4628 */
4629DECLINLINE(void) ASMAtomicWriteU128U(volatile RTUINT128U *pu128, const RTUINT128U u128) RT_NOTHROW_DEF
4630{
4631 ASMAtomicWriteU128v2(&pu128->u, u128.s.Hi, u128.s.Lo);
4632}
4633
4634
4635/**
4636 * Atomically writes an unsigned 128-bit value, unordered.
4637 *
4638 * @param pu128 Pointer to the variable to overwrite. Must be aligned
4639 * on 16 byte boundrary.
4640 * @param u128 The the new value.
4641 * @note This is ordered on AMD64.
4642 */
4643DECLINLINE(void) ASMAtomicUoWriteU128U(volatile RTUINT128U *pu128, const RTUINT128U u128) RT_NOTHROW_DEF
4644{
4645 ASMAtomicUoWriteU128v2(&pu128->u, u128.s.Hi, u128.s.Lo);
4646}
4647
4648#endif /* RTASM_HAVE_WRITE_U128 */
4649
4650/**
4651 * Atomically writes a size_t value, ordered.
4652 *
4653 * @param pcb Pointer to the size_t variable to write.
4654 * @param cb The value to assign to *pcb.
4655 */
4656DECLINLINE(void) ASMAtomicWriteZ(volatile size_t RT_FAR *pcb, size_t cb) RT_NOTHROW_DEF
4657{
4658#if ARCH_BITS == 64
4659 ASMAtomicWriteU64((uint64_t volatile *)pcb, cb);
4660#elif ARCH_BITS == 32
4661 ASMAtomicWriteU32((uint32_t volatile *)pcb, cb);
4662#elif ARCH_BITS == 16
4663 AssertCompileSize(size_t, 2);
4664 ASMAtomicWriteU16((uint16_t volatile *)pcb, cb);
4665#else
4666# error "Unsupported ARCH_BITS value"
4667#endif
4668}
4669
4670
4671/**
4672 * Atomically writes a size_t value, unordered.
4673 *
4674 * @param pcb Pointer to the size_t variable to write.
4675 * @param cb The value to assign to *pcb.
4676 */
4677DECLINLINE(void) ASMAtomicUoWriteZ(volatile size_t RT_FAR *pcb, size_t cb) RT_NOTHROW_DEF
4678{
4679#if ARCH_BITS == 64
4680 ASMAtomicUoWriteU64((uint64_t volatile *)pcb, cb);
4681#elif ARCH_BITS == 32
4682 ASMAtomicUoWriteU32((uint32_t volatile *)pcb, cb);
4683#elif ARCH_BITS == 16
4684 AssertCompileSize(size_t, 2);
4685 ASMAtomicUoWriteU16((uint16_t volatile *)pcb, cb);
4686#else
4687# error "Unsupported ARCH_BITS value"
4688#endif
4689}
4690
4691
4692/**
4693 * Atomically writes a boolean value, unordered.
4694 *
4695 * @param pf Pointer to the boolean variable to write.
4696 * @param f The boolean value to assign to *pf.
4697 */
4698DECLINLINE(void) ASMAtomicWriteBool(volatile bool RT_FAR *pf, bool f) RT_NOTHROW_DEF
4699{
4700 ASMAtomicWriteU8((uint8_t volatile RT_FAR *)pf, f);
4701}
4702
4703
4704/**
4705 * Atomically writes a boolean value, unordered.
4706 *
4707 * @param pf Pointer to the boolean variable to write.
4708 * @param f The boolean value to assign to *pf.
4709 */
4710DECLINLINE(void) ASMAtomicUoWriteBool(volatile bool RT_FAR *pf, bool f) RT_NOTHROW_DEF
4711{
4712 *pf = f; /* byte writes are atomic on x86 */
4713}
4714
4715
4716/**
4717 * Atomically writes a pointer value, ordered.
4718 *
4719 * @param ppv Pointer to the pointer variable to write.
4720 * @param pv The pointer value to assign to *ppv.
4721 */
4722DECLINLINE(void) ASMAtomicWritePtrVoid(void RT_FAR * volatile RT_FAR *ppv, const void *pv) RT_NOTHROW_DEF
4723{
4724#if ARCH_BITS == 32 || ARCH_BITS == 16
4725 ASMAtomicWriteU32((volatile uint32_t RT_FAR *)(void RT_FAR *)ppv, (uint32_t)pv);
4726#elif ARCH_BITS == 64
4727 ASMAtomicWriteU64((volatile uint64_t RT_FAR *)(void RT_FAR *)ppv, (uint64_t)pv);
4728#else
4729# error "ARCH_BITS is bogus"
4730#endif
4731}
4732
4733
4734/**
4735 * Atomically writes a pointer value, unordered.
4736 *
4737 * @param ppv Pointer to the pointer variable to write.
4738 * @param pv The pointer value to assign to *ppv.
4739 */
4740DECLINLINE(void) ASMAtomicUoWritePtrVoid(void RT_FAR * volatile RT_FAR *ppv, const void *pv) RT_NOTHROW_DEF
4741{
4742#if ARCH_BITS == 32 || ARCH_BITS == 16
4743 ASMAtomicUoWriteU32((volatile uint32_t RT_FAR *)(void RT_FAR *)ppv, (uint32_t)pv);
4744#elif ARCH_BITS == 64
4745 ASMAtomicUoWriteU64((volatile uint64_t RT_FAR *)(void RT_FAR *)ppv, (uint64_t)pv);
4746#else
4747# error "ARCH_BITS is bogus"
4748#endif
4749}
4750
4751
/**
 * Atomically writes a pointer value, ordered.
 *
 * Checks at compile time that both *ppv and pv are pointer sized, asserts that
 * ppv is naturally aligned, and then stores via ASMAtomicWritePtrVoid().  All
 * macro arguments are parenthesised, and the GNU C variant evaluates each of
 * them only once outside of sizeof.
 *
 * @param   ppv     Pointer to the pointer variable to write.
 * @param   pv      The pointer value to assign to *ppv. If NULL use
 *                  ASMAtomicWriteNullPtr or you'll land in trouble.
 *
 * @remarks This is relatively type safe on GCC platforms when @a pv isn't
 *          NULL.
 */
#ifdef __GNUC__
# define ASMAtomicWritePtr(ppv, pv) \
    do \
    { \
        __typeof__(*(ppv)) volatile RT_FAR * const ppvTypeChecked = (ppv); \
        __typeof__(*(ppv)) const pvTypeChecked = (pv); \
        \
        AssertCompile(sizeof(*(ppv)) == sizeof(void RT_FAR *)); \
        AssertCompile(sizeof(pv) == sizeof(void RT_FAR *)); \
        Assert(!( (uintptr_t)ppvTypeChecked & ((ARCH_BITS / 8) - 1) )); \
        \
        ASMAtomicWritePtrVoid((void RT_FAR * volatile RT_FAR *)(ppvTypeChecked), (void RT_FAR *)(pvTypeChecked)); \
    } while (0)
#else
# define ASMAtomicWritePtr(ppv, pv) \
    do \
    { \
        AssertCompile(sizeof(*(ppv)) == sizeof(void RT_FAR *)); \
        AssertCompile(sizeof(pv) == sizeof(void RT_FAR *)); \
        Assert(!( (uintptr_t)(ppv) & ((ARCH_BITS / 8) - 1) )); \
        \
        ASMAtomicWritePtrVoid((void RT_FAR * volatile RT_FAR *)(ppv), (void RT_FAR *)(pv)); \
    } while (0)
#endif
4786
4787
4788/**
4789 * Atomically sets a pointer to NULL, ordered.
4790 *
4791 * @param ppv Pointer to the pointer variable that should be set to NULL.
4792 *
4793 * @remarks This is relatively type safe on GCC platforms.
4794 */
4795#if RT_GNUC_PREREQ(4, 2)
4796# define ASMAtomicWriteNullPtr(ppv) \
4797 do \
4798 { \
4799 __typeof__(*(ppv)) * const ppvTypeChecked = (ppv); \
4800 AssertCompile(sizeof(*ppv) == sizeof(void RT_FAR *)); \
4801 Assert(!( (uintptr_t)ppv & ((ARCH_BITS / 8) - 1) )); \
4802 ASMAtomicWritePtrVoid((void RT_FAR * volatile RT_FAR *)(ppvTypeChecked), NULL); \
4803 } while (0)
4804#else
4805# define ASMAtomicWriteNullPtr(ppv) \
4806 do \
4807 { \
4808 AssertCompile(sizeof(*ppv) == sizeof(void RT_FAR *)); \
4809 Assert(!( (uintptr_t)ppv & ((ARCH_BITS / 8) - 1) )); \
4810 ASMAtomicWritePtrVoid((void RT_FAR * volatile RT_FAR *)(ppv), NULL); \
4811 } while (0)
4812#endif
4813
4814
4815/**
4816 * Atomically writes a pointer value, unordered.
4817 *
4818 * @returns Current *pv value
4819 * @param ppv Pointer to the pointer variable.
4820 * @param pv The pointer value to assign to *ppv. If NULL use
4821 * ASMAtomicUoWriteNullPtr or you'll land in trouble.
4822 *
4823 * @remarks This is relatively type safe on GCC platforms when @a pv isn't
4824 * NULL.
4825 */
4826#if RT_GNUC_PREREQ(4, 2)
4827# define ASMAtomicUoWritePtr(ppv, pv) \
4828 do \
4829 { \
4830 __typeof__(*(ppv)) volatile * const ppvTypeChecked = (ppv); \
4831 __typeof__(*(ppv)) const pvTypeChecked = (pv); \
4832 \
4833 AssertCompile(sizeof(*ppv) == sizeof(void *)); \
4834 AssertCompile(sizeof(pv) == sizeof(void *)); \
4835 Assert(!( (uintptr_t)ppv & ((ARCH_BITS / 8) - 1) )); \
4836 \
4837 *(ppvTypeChecked) = pvTypeChecked; \
4838 } while (0)
4839#else
4840# define ASMAtomicUoWritePtr(ppv, pv) \
4841 do \
4842 { \
4843 AssertCompile(sizeof(*ppv) == sizeof(void RT_FAR *)); \
4844 AssertCompile(sizeof(pv) == sizeof(void RT_FAR *)); \
4845 Assert(!( (uintptr_t)ppv & ((ARCH_BITS / 8) - 1) )); \
4846 *(ppv) = pv; \
4847 } while (0)
4848#endif
4849
4850
/**
 * Atomically sets a pointer to NULL, unordered.
 *
 * Checks at compile time that *ppv is pointer sized, asserts that ppv is
 * naturally aligned, and then performs a plain store of NULL.  The macro
 * argument is parenthesised, and the GNU C variant evaluates it only once
 * outside of sizeof.
 *
 * @param   ppv     Pointer to the pointer variable that should be set to NULL.
 *
 * @remarks This is relatively type safe on GCC platforms.
 */
#ifdef __GNUC__
# define ASMAtomicUoWriteNullPtr(ppv) \
    do \
    { \
        __typeof__(*(ppv)) volatile * const ppvTypeChecked = (ppv); \
        AssertCompile(sizeof(*(ppv)) == sizeof(void *)); \
        Assert(!( (uintptr_t)ppvTypeChecked & ((ARCH_BITS / 8) - 1) )); \
        *(ppvTypeChecked) = NULL; \
    } while (0)
#else
# define ASMAtomicUoWriteNullPtr(ppv) \
    do \
    { \
        AssertCompile(sizeof(*(ppv)) == sizeof(void RT_FAR *)); \
        Assert(!( (uintptr_t)(ppv) & ((ARCH_BITS / 8) - 1) )); \
        *(ppv) = NULL; \
    } while (0)
#endif
4876
4877
4878/**
4879 * Atomically write a typical IPRT handle value, ordered.
4880 *
4881 * @param ph Pointer to the variable to update.
4882 * @param hNew The value to assign to *ph.
4883 *
4884 * @remarks This doesn't currently work for all handles (like RTFILE).
4885 */
4886#if HC_ARCH_BITS == 32 || ARCH_BITS == 16
4887# define ASMAtomicWriteHandle(ph, hNew) \
4888 do { \
4889 AssertCompile(sizeof(*(ph)) == sizeof(uint32_t)); \
4890 ASMAtomicWriteU32((uint32_t volatile RT_FAR *)(ph), (const uint32_t)(hNew)); \
4891 } while (0)
4892#elif HC_ARCH_BITS == 64
4893# define ASMAtomicWriteHandle(ph, hNew) \
4894 do { \
4895 AssertCompile(sizeof(*(ph)) == sizeof(uint64_t)); \
4896 ASMAtomicWriteU64((uint64_t volatile RT_FAR *)(ph), (const uint64_t)(hNew)); \
4897 } while (0)
4898#else
4899# error HC_ARCH_BITS
4900#endif
4901
4902
4903/**
4904 * Atomically write a typical IPRT handle value, unordered.
4905 *
4906 * @param ph Pointer to the variable to update.
4907 * @param hNew The value to assign to *ph.
4908 *
4909 * @remarks This doesn't currently work for all handles (like RTFILE).
4910 */
4911#if HC_ARCH_BITS == 32 || ARCH_BITS == 16
4912# define ASMAtomicUoWriteHandle(ph, hNew) \
4913 do { \
4914 AssertCompile(sizeof(*(ph)) == sizeof(uint32_t)); \
4915 ASMAtomicUoWriteU32((uint32_t volatile RT_FAR *)(ph), (const uint32_t)hNew); \
4916 } while (0)
4917#elif HC_ARCH_BITS == 64
4918# define ASMAtomicUoWriteHandle(ph, hNew) \
4919 do { \
4920 AssertCompile(sizeof(*(ph)) == sizeof(uint64_t)); \
4921 ASMAtomicUoWriteU64((uint64_t volatile RT_FAR *)(ph), (const uint64_t)hNew); \
4922 } while (0)
4923#else
4924# error HC_ARCH_BITS
4925#endif
4926
4927
/**
 * Atomically write a value which size might differ
 * between platforms or compilers, ordered.
 *
 * Dispatches on sizeof(*pu) to the 8, 16, 32 or 64-bit ordered write.  Any
 * other size triggers an assertion and nothing is written.
 *
 * @param   pu      Pointer to the variable to update.
 * @param   uNew    The value to assign to *pu.
 */
#define ASMAtomicWriteSize(pu, uNew) \
    do { \
        switch (sizeof(*(pu))) { \
            case 1: ASMAtomicWriteU8( (volatile uint8_t  RT_FAR *)(void RT_FAR *)(pu), (uint8_t )(uNew)); break; \
            case 2: ASMAtomicWriteU16((volatile uint16_t RT_FAR *)(void RT_FAR *)(pu), (uint16_t)(uNew)); break; \
            case 4: ASMAtomicWriteU32((volatile uint32_t RT_FAR *)(void RT_FAR *)(pu), (uint32_t)(uNew)); break; \
            case 8: ASMAtomicWriteU64((volatile uint64_t RT_FAR *)(void RT_FAR *)(pu), (uint64_t)(uNew)); break; \
            /* sizeof yields size_t; cast so the argument matches the %d conversion. */ \
            default: AssertMsgFailed(("ASMAtomicWriteSize: size %d is not supported\n", (int)sizeof(*(pu)))); \
        } \
    } while (0)
4945
/**
 * Atomically write a value which size might differ
 * between platforms or compilers, unordered.
 *
 * Dispatches on sizeof(*pu) to the 8, 16, 32 or 64-bit unordered write.  Any
 * other size triggers an assertion and nothing is written.
 *
 * @param   pu      Pointer to the variable to update.
 * @param   uNew    The value to assign to *pu.
 */
#define ASMAtomicUoWriteSize(pu, uNew) \
    do { \
        switch (sizeof(*(pu))) { \
            case 1: ASMAtomicUoWriteU8( (volatile uint8_t  RT_FAR *)(void RT_FAR *)(pu), (uint8_t )(uNew)); break; \
            case 2: ASMAtomicUoWriteU16((volatile uint16_t RT_FAR *)(void RT_FAR *)(pu), (uint16_t)(uNew)); break; \
            case 4: ASMAtomicUoWriteU32((volatile uint32_t RT_FAR *)(void RT_FAR *)(pu), (uint32_t)(uNew)); break; \
            case 8: ASMAtomicUoWriteU64((volatile uint64_t RT_FAR *)(void RT_FAR *)(pu), (uint64_t)(uNew)); break; \
            /* Report under this macro's own name; sizeof is cast to match the %d conversion. */ \
            default: AssertMsgFailed(("ASMAtomicUoWriteSize: size %d is not supported\n", (int)sizeof(*(pu)))); \
        } \
    } while (0)
4963
4964
4965
/**
 * Atomically exchanges and adds to a 16-bit value, ordered.
 *
 * Only a prototype is provided here; no inline implementation exists in
 * this part of the header.
 *
 * @returns The old value.
 * @param   pu16        Pointer to the value.
 * @param   u16         Number to add.  Note that the parameter is declared
 *                      as uint32_t although the target is 16 bits wide.
 *
 * @remarks Currently not implemented, just to make 16-bit code happy.
 * @remarks x86: Requires a 486 or later.
 */
RT_ASM_DECL_PRAGMA_WATCOM(uint16_t) ASMAtomicAddU16(uint16_t volatile RT_FAR *pu16, uint32_t u16) RT_NOTHROW_PROTO;
4977
4978
/**
 * Atomically exchanges and adds to a 32-bit value, ordered.
 *
 * Depending on the build configuration this is either an external assembly
 * routine (prototype only) or an inline function using a compiler intrinsic,
 * x86 inline assembly or ARM inline assembly.
 *
 * @returns The old value.
 * @param   pu32        Pointer to the value.
 * @param   u32         Number to add.
 *
 * @remarks x86: Requires a 486 or later.
 */
#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
RT_ASM_DECL_PRAGMA_WATCOM(uint32_t) ASMAtomicAddU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_PROTO;
#else
DECLINLINE(uint32_t) ASMAtomicAddU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_DEF
{
# if RT_INLINE_ASM_USES_INTRIN
    /* Intrinsic path: the intrinsic's result is what gets returned as the old value. */
    u32 = _InterlockedExchangeAdd((long RT_FAR *)pu32, u32);
    return u32;

# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
#  if RT_INLINE_ASM_GNU_STYLE
    /* u32 is both the addend going in (operand "0") and the register result
       coming out of the locked xadd; memory and flags are declared clobbered. */
    __asm__ __volatile__("lock; xaddl %0, %1\n\t"
                         : "=r" (u32)
                         , "=m" (*pu32)
                         : "0" (u32)
                         , "m" (*pu32)
                         : "memory"
                         , "cc");
    return u32;
#  else
    /* Non-GNU inline assembly flavour of the same locked xadd. */
    __asm
    {
        mov     eax, [u32]
#   ifdef RT_ARCH_AMD64
        mov     rdx, [pu32]
        lock xadd [rdx], eax
#   else
        mov     edx, [pu32]
        lock xadd [edx], eax
#   endif
        mov     [u32], eax
    }
    return u32;
#  endif

# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
    /* M1 benchmark: ldaddal=6907 vs dmb+ldadd=2114 vs non-lse=6249 (ps/call) */
#  if defined(RTASM_ARM64_USE_FEAT_LSE)
    /* LSE path: either a single ldaddal, or a barrier followed by plain ldadd. */
    uint32_t u32OldRet;
    __asm__ __volatile__("Lstart_ASMAtomicAddU32_%=:\n\t"
#   if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
                         "ldaddal %w[uAddend], %w[uOldActual], %[pMem]\n\t"
#   else
                         RTASM_ARM_DMB_SY
                         "ldadd %w[uAddend], %w[uOldActual], %[pMem]\n\t"
#   endif
                         : [pMem] "+Q" (*pu32)
                         , [uOldActual] "=&r" (u32OldRet)
                         : [uAddend] "r" (u32)
                         : );
#  else
    /* NOTE(review): u32OldRet is not declared in this branch; presumably the
       macro below declares it - confirm against the macro definition.  The
       two strings look like the 64-bit and 32-bit ARM variants of the add. */
    RTASM_ARM_LOAD_MODIFY_STORE_RET_OLD_32(ASMAtomicAddU32, pu32, DMB_SY,
                                           "add %w[uNew], %w[uOld], %w[uVal]\n\t",
                                           "add %[uNew], %[uOld], %[uVal]\n\t",
                                           [uVal] "r" (u32));
#  endif
    return u32OldRet;

# else
#  error "Port me"
# endif
}
#endif
5051
5052
5053/**
5054 * Atomically exchanges and adds to a signed 32-bit value, ordered.
5055 *
5056 * @returns The old value.
5057 * @param pi32 Pointer to the value.
5058 * @param i32 Number to add.
5059 *
5060 * @remarks x86: Requires a 486 or later.
5061 */
5062DECLINLINE(int32_t) ASMAtomicAddS32(int32_t volatile RT_FAR *pi32, int32_t i32) RT_NOTHROW_DEF
5063{
5064 return (int32_t)ASMAtomicAddU32((uint32_t volatile RT_FAR *)pi32, (uint32_t)i32);
5065}
5066
5067
5068/**
5069 * Atomically exchanges and adds to a 64-bit value, ordered.
5070 *
5071 * @returns The old value.
5072 * @param pu64 Pointer to the value.
5073 * @param u64 Number to add.
5074 *
5075 * @remarks x86: Requires a Pentium or later.
5076 */
5077#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
5078DECLASM(uint64_t) ASMAtomicAddU64(uint64_t volatile RT_FAR *pu64, uint64_t u64) RT_NOTHROW_PROTO;
5079#else
5080DECLINLINE(uint64_t) ASMAtomicAddU64(uint64_t volatile RT_FAR *pu64, uint64_t u64) RT_NOTHROW_DEF
5081{
5082# if RT_INLINE_ASM_USES_INTRIN && (defined(RT_ARCH_AMD64) || defined(RT_ARCH_ARM64))
5083 u64 = _InterlockedExchangeAdd64((__int64 RT_FAR *)pu64, u64);
5084 return u64;
5085
5086# elif RT_INLINE_ASM_GNU_STYLE && defined(RT_ARCH_AMD64)
5087 __asm__ __volatile__("lock; xaddq %0, %1\n\t"
5088 : "=r" (u64)
5089 , "=m" (*pu64)
5090 : "0" (u64)
5091 , "m" (*pu64)
5092 : "memory"
5093 , "cc");
5094 return u64;
5095
5096# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
5097# if defined(RTASM_ARM64_USE_FEAT_LSE)
5098 uint64_t u64OldRet;
5099 __asm__ __volatile__("Lstart_ASMAtomicAddU64_%=:\n\t"
5100# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
5101 "ldaddal %[uAddend], %[uOldActual], %[pMem]\n\t"
5102# else
5103 RTASM_ARM_DMB_SY
5104 "ldadd %[uAddend], %[uOldActual], %[pMem]\n\t"
5105# endif
5106 : [pMem] "+Q" (*pu64)
5107 , [uOldActual] "=&r" (u64OldRet)
5108 : [uAddend] "r" (u64)
5109 : );
5110# else
5111 RTASM_ARM_LOAD_MODIFY_STORE_RET_OLD_64(ASMAtomicAddU64, pu64, DMB_SY,
5112 "add %[uNew], %[uOld], %[uVal]\n\t"
5113 ,
5114 "add %[uNew], %[uOld], %[uVal]\n\t"
5115 "adc %H[uNew], %H[uOld], %H[uVal]\n\t",
5116 [uVal] "r" (u64));
5117# endif
5118 return u64OldRet;
5119
5120# else
5121 uint64_t u64Old;
5122 for (;;)
5123 {
5124 uint64_t u64New;
5125 u64Old = ASMAtomicUoReadU64(pu64);
5126 u64New = u64Old + u64;
5127 if (ASMAtomicCmpXchgU64(pu64, u64New, u64Old))
5128 break;
5129 ASMNopPause();
5130 }
5131 return u64Old;
5132# endif
5133}
5134#endif
5135
5136
5137/**
5138 * Atomically exchanges and adds to a signed 64-bit value, ordered.
5139 *
5140 * @returns The old value.
5141 * @param pi64 Pointer to the value.
5142 * @param i64 Number to add.
5143 *
5144 * @remarks x86: Requires a Pentium or later.
5145 */
5146DECLINLINE(int64_t) ASMAtomicAddS64(int64_t volatile RT_FAR *pi64, int64_t i64) RT_NOTHROW_DEF
5147{
5148 return (int64_t)ASMAtomicAddU64((uint64_t volatile RT_FAR *)pi64, (uint64_t)i64);
5149}
5150
5151
5152/**
5153 * Atomically exchanges and adds to a size_t value, ordered.
5154 *
5155 * @returns The old value.
5156 * @param pcb Pointer to the size_t value.
5157 * @param cb Number to add.
5158 */
5159DECLINLINE(size_t) ASMAtomicAddZ(size_t volatile RT_FAR *pcb, size_t cb) RT_NOTHROW_DEF
5160{
5161#if ARCH_BITS == 64
5162 AssertCompileSize(size_t, 8);
5163 return ASMAtomicAddU64((uint64_t volatile RT_FAR *)pcb, cb);
5164#elif ARCH_BITS == 32
5165 AssertCompileSize(size_t, 4);
5166 return ASMAtomicAddU32((uint32_t volatile RT_FAR *)pcb, cb);
5167#elif ARCH_BITS == 16
5168 AssertCompileSize(size_t, 2);
5169 return ASMAtomicAddU16((uint16_t volatile RT_FAR *)pcb, cb);
5170#else
5171# error "Unsupported ARCH_BITS value"
5172#endif
5173}
5174
5175
/**
 * Atomically exchanges and adds a value which size might differ between
 * platforms or compilers, ordered.
 *
 * Dispatches on sizeof(*pu); only 4 and 8 byte variables are supported, any
 * other size triggers an assertion and leaves *pu and *puOld untouched.
 *
 * @param   pu      Pointer to the variable to update.
 * @param   uNew    The value to add to *pu.
 * @param   puOld   Where to store the old value.
 */
#define ASMAtomicAddSize(pu, uNew, puOld) \
    do { \
        switch (sizeof(*(pu))) { \
            case 4: *(uint32_t RT_FAR *)(puOld) = ASMAtomicAddU32((volatile uint32_t RT_FAR *)(void RT_FAR *)(pu), (uint32_t)(uNew)); break; \
            case 8: *(uint64_t RT_FAR *)(puOld) = ASMAtomicAddU64((volatile uint64_t RT_FAR *)(void RT_FAR *)(pu), (uint64_t)(uNew)); break; \
            /* sizeof yields size_t; cast so the argument matches the %d conversion. */ \
            default: AssertMsgFailed(("ASMAtomicAddSize: size %d is not supported\n", (int)sizeof(*(pu)))); \
        } \
    } while (0)
5192
5193
5194
5195/**
5196 * Atomically exchanges and subtracts to an unsigned 16-bit value, ordered.
5197 *
5198 * @returns The old value.
5199 * @param pu16 Pointer to the value.
5200 * @param u16 Number to subtract.
5201 *
5202 * @remarks x86: Requires a 486 or later.
5203 */
5204DECLINLINE(uint16_t) ASMAtomicSubU16(uint16_t volatile RT_FAR *pu16, uint32_t u16) RT_NOTHROW_DEF
5205{
5206 return ASMAtomicAddU16(pu16, (uint16_t)-(int16_t)u16);
5207}
5208
5209
5210/**
5211 * Atomically exchanges and subtracts to a signed 16-bit value, ordered.
5212 *
5213 * @returns The old value.
5214 * @param pi16 Pointer to the value.
5215 * @param i16 Number to subtract.
5216 *
5217 * @remarks x86: Requires a 486 or later.
5218 */
5219DECLINLINE(int16_t) ASMAtomicSubS16(int16_t volatile RT_FAR *pi16, int16_t i16) RT_NOTHROW_DEF
5220{
5221 return (int16_t)ASMAtomicAddU16((uint16_t volatile RT_FAR *)pi16, (uint16_t)-i16);
5222}
5223
5224
5225/**
5226 * Atomically exchanges and subtracts to an unsigned 32-bit value, ordered.
5227 *
5228 * @returns The old value.
5229 * @param pu32 Pointer to the value.
5230 * @param u32 Number to subtract.
5231 *
5232 * @remarks x86: Requires a 486 or later.
5233 */
5234DECLINLINE(uint32_t) ASMAtomicSubU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_DEF
5235{
5236 return ASMAtomicAddU32(pu32, (uint32_t)-(int32_t)u32);
5237}
5238
5239
5240/**
5241 * Atomically exchanges and subtracts to a signed 32-bit value, ordered.
5242 *
5243 * @returns The old value.
5244 * @param pi32 Pointer to the value.
5245 * @param i32 Number to subtract.
5246 *
5247 * @remarks x86: Requires a 486 or later.
5248 */
5249DECLINLINE(int32_t) ASMAtomicSubS32(int32_t volatile RT_FAR *pi32, int32_t i32) RT_NOTHROW_DEF
5250{
5251 return (int32_t)ASMAtomicAddU32((uint32_t volatile RT_FAR *)pi32, (uint32_t)-i32);
5252}
5253
5254
5255/**
5256 * Atomically exchanges and subtracts to an unsigned 64-bit value, ordered.
5257 *
5258 * @returns The old value.
5259 * @param pu64 Pointer to the value.
5260 * @param u64 Number to subtract.
5261 *
5262 * @remarks x86: Requires a Pentium or later.
5263 */
5264DECLINLINE(uint64_t) ASMAtomicSubU64(uint64_t volatile RT_FAR *pu64, uint64_t u64) RT_NOTHROW_DEF
5265{
5266 return ASMAtomicAddU64(pu64, (uint64_t)-(int64_t)u64);
5267}
5268
5269
5270/**
5271 * Atomically exchanges and subtracts to a signed 64-bit value, ordered.
5272 *
5273 * @returns The old value.
5274 * @param pi64 Pointer to the value.
5275 * @param i64 Number to subtract.
5276 *
5277 * @remarks x86: Requires a Pentium or later.
5278 */
5279DECLINLINE(int64_t) ASMAtomicSubS64(int64_t volatile RT_FAR *pi64, int64_t i64) RT_NOTHROW_DEF
5280{
5281 return (int64_t)ASMAtomicAddU64((uint64_t volatile RT_FAR *)pi64, (uint64_t)-i64);
5282}
5283
5284
5285/**
5286 * Atomically exchanges and subtracts to a size_t value, ordered.
5287 *
5288 * @returns The old value.
5289 * @param pcb Pointer to the size_t value.
5290 * @param cb Number to subtract.
5291 *
5292 * @remarks x86: Requires a 486 or later.
5293 */
5294DECLINLINE(size_t) ASMAtomicSubZ(size_t volatile RT_FAR *pcb, size_t cb) RT_NOTHROW_DEF
5295{
5296#if ARCH_BITS == 64
5297 return ASMAtomicSubU64((uint64_t volatile RT_FAR *)pcb, cb);
5298#elif ARCH_BITS == 32
5299 return ASMAtomicSubU32((uint32_t volatile RT_FAR *)pcb, cb);
5300#elif ARCH_BITS == 16
5301 AssertCompileSize(size_t, 2);
5302 return ASMAtomicSubU16((uint16_t volatile RT_FAR *)pcb, cb);
5303#else
5304# error "Unsupported ARCH_BITS value"
5305#endif
5306}
5307
5308
/**
 * Atomically exchanges and subtracts a value which size might differ between
 * platforms or compilers, ordered.
 *
 * Dispatches on sizeof(*pu); only 4 and 8 byte variables are supported, any
 * other size triggers an assertion and leaves *pu and *puOld untouched.
 *
 * @param   pu      Pointer to the variable to update.
 * @param   uNew    The value to subtract from *pu.
 * @param   puOld   Where to store the old value.
 *
 * @remarks x86: Requires a 486 or later.
 */
#define ASMAtomicSubSize(pu, uNew, puOld) \
    do { \
        switch (sizeof(*(pu))) { \
            case 4: *(uint32_t RT_FAR *)(puOld) = ASMAtomicSubU32((volatile uint32_t RT_FAR *)(void RT_FAR *)(pu), (uint32_t)(uNew)); break; \
            case 8: *(uint64_t RT_FAR *)(puOld) = ASMAtomicSubU64((volatile uint64_t RT_FAR *)(void RT_FAR *)(pu), (uint64_t)(uNew)); break; \
            /* sizeof yields size_t; cast so the argument matches the %d conversion. */ \
            default: AssertMsgFailed(("ASMAtomicSubSize: size %d is not supported\n", (int)sizeof(*(pu)))); \
        } \
    } while (0)
5327
5328
5329
/**
 * Atomically increment a 16-bit value, ordered.
 *
 * Only a prototype is provided here; no inline implementation exists in
 * this part of the header.
 *
 * @returns The new value.
 * @param   pu16        Pointer to the value to increment.
 * @remarks Not implemented. Just to make 16-bit code happy.
 *
 * @remarks x86: Requires a 486 or later.
 */
RT_ASM_DECL_PRAGMA_WATCOM(uint16_t) ASMAtomicIncU16(uint16_t volatile RT_FAR *pu16) RT_NOTHROW_PROTO;
5340
5341
5342/**
5343 * Atomically increment a 32-bit value, ordered.
5344 *
5345 * @returns The new value.
5346 * @param pu32 Pointer to the value to increment.
5347 *
5348 * @remarks x86: Requires a 486 or later.
5349 */
5350#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
5351RT_ASM_DECL_PRAGMA_WATCOM(uint32_t) ASMAtomicIncU32(uint32_t volatile RT_FAR *pu32) RT_NOTHROW_PROTO;
5352#else
5353DECLINLINE(uint32_t) ASMAtomicIncU32(uint32_t volatile RT_FAR *pu32) RT_NOTHROW_DEF
5354{
5355# if RT_INLINE_ASM_USES_INTRIN
5356 return (uint32_t)_InterlockedIncrement((long RT_FAR *)pu32);
5357
5358# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
5359# if RT_INLINE_ASM_GNU_STYLE
5360 uint32_t u32;
5361 __asm__ __volatile__("lock; xaddl %0, %1\n\t"
5362 : "=r" (u32)
5363 , "=m" (*pu32)
5364 : "0" (1)
5365 , "m" (*pu32)
5366 : "memory"
5367 , "cc");
5368 return u32+1;
5369# else
5370 __asm
5371 {
5372 mov eax, 1
5373# ifdef RT_ARCH_AMD64
5374 mov rdx, [pu32]
5375 lock xadd [rdx], eax
5376# else
5377 mov edx, [pu32]
5378 lock xadd [edx], eax
5379# endif
5380 mov u32, eax
5381 }
5382 return u32+1;
5383# endif
5384
5385# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
5386 /* M1 benchmark: ldaddal=6887 vs dmb+ldadd=2117 vs non-lse=6247 (ps/call) */
5387# if defined(RTASM_ARM64_USE_FEAT_LSE)
5388 uint32_t u32NewRet;
5389 __asm__ __volatile__("Lstart_ASMAtomicIncU32_%=:\n\t"
5390# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
5391 "ldaddal %w[uAddend], %w[uNewRet], %[pMem]\n\t"
5392# else
5393 RTASM_ARM_DMB_SY
5394 "ldadd %w[uAddend], %w[uNewRet], %[pMem]\n\t"
5395# endif
5396 "add %w[uNewRet], %w[uNewRet], #1\n\t"
5397 : [pMem] "+Q" (*pu32)
5398 , [uNewRet] "=&r" (u32NewRet)
5399 : [uAddend] "r" ((uint32_t)1)
5400 : );
5401# else
5402 RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_32(ASMAtomicIncU32, pu32, DMB_SY,
5403 "add %w[uNew], %w[uNew], #1\n\t",
5404 "add %[uNew], %[uNew], #1\n\t" /* arm6 / thumb2+ */,
5405 "X" (0) /* dummy */);
5406# endif
5407 return u32NewRet;
5408
5409# else
5410 return ASMAtomicAddU32(pu32, 1) + 1;
5411# endif
5412}
5413#endif
5414
5415
5416/**
5417 * Atomically increment a signed 32-bit value, ordered.
5418 *
5419 * @returns The new value.
5420 * @param pi32 Pointer to the value to increment.
5421 *
5422 * @remarks x86: Requires a 486 or later.
5423 */
5424DECLINLINE(int32_t) ASMAtomicIncS32(int32_t volatile RT_FAR *pi32) RT_NOTHROW_DEF
5425{
5426 return (int32_t)ASMAtomicIncU32((uint32_t volatile RT_FAR *)pi32);
5427}
5428
5429
5430/**
5431 * Atomically increment a 64-bit value, ordered.
5432 *
5433 * @returns The new value.
5434 * @param pu64 Pointer to the value to increment.
5435 *
5436 * @remarks x86: Requires a Pentium or later.
5437 */
5438#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
5439DECLASM(uint64_t) ASMAtomicIncU64(uint64_t volatile RT_FAR *pu64) RT_NOTHROW_PROTO;
5440#else
5441DECLINLINE(uint64_t) ASMAtomicIncU64(uint64_t volatile RT_FAR *pu64) RT_NOTHROW_DEF
5442{
5443# if RT_INLINE_ASM_USES_INTRIN && (defined(RT_ARCH_AMD64) || defined(RT_ARCH_ARM64))
5444 return (uint64_t)_InterlockedIncrement64((__int64 RT_FAR *)pu64);
5445
5446# elif RT_INLINE_ASM_GNU_STYLE && defined(RT_ARCH_AMD64)
5447 uint64_t u64;
5448 __asm__ __volatile__("lock; xaddq %0, %1\n\t"
5449 : "=r" (u64)
5450 , "=m" (*pu64)
5451 : "0" (1)
5452 , "m" (*pu64)
5453 : "memory"
5454 , "cc");
5455 return u64 + 1;
5456
5457# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
5458# if defined(RTASM_ARM64_USE_FEAT_LSE)
5459 uint64_t u64NewRet;
5460 __asm__ __volatile__("Lstart_ASMAtomicIncU64_%=:\n\t"
5461# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
5462 "ldaddal %[uAddend], %[uNewRet], %[pMem]\n\t"
5463# else
5464 RTASM_ARM_DMB_SY
5465 "ldadd %[uAddend], %[uNewRet], %[pMem]\n\t"
5466# endif
5467 "add %[uNewRet], %[uNewRet], #1\n\t"
5468 : [pMem] "+Q" (*pu64)
5469 , [uNewRet] "=&r" (u64NewRet)
5470 : [uAddend] "r" ((uint64_t)1)
5471 : );
5472# else
5473 RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_64(ASMAtomicIncU64, pu64, DMB_SY,
5474 "add %[uNew], %[uNew], #1\n\t"
5475 ,
5476 "add %[uNew], %[uNew], #1\n\t" /* arm6 / thumb2+ */
5477 "adc %H[uNew], %H[uNew], %[uZeroVal]\n\t",
5478 RTASM_ARM_PICK_6432("X" (0) /* dummy */, [uZeroVal] "r" (0)) );
5479# endif
5480 return u64NewRet;
5481
5482# else
5483 return ASMAtomicAddU64(pu64, 1) + 1;
5484# endif
5485}
5486#endif
5487
5488
5489/**
5490 * Atomically increment a signed 64-bit value, ordered.
5491 *
5492 * @returns The new value.
5493 * @param pi64 Pointer to the value to increment.
5494 *
5495 * @remarks x86: Requires a Pentium or later.
5496 */
5497DECLINLINE(int64_t) ASMAtomicIncS64(int64_t volatile RT_FAR *pi64) RT_NOTHROW_DEF
5498{
5499 return (int64_t)ASMAtomicIncU64((uint64_t volatile RT_FAR *)pi64);
5500}
5501
5502
5503/**
5504 * Atomically increment a size_t value, ordered.
5505 *
5506 * @returns The new value.
5507 * @param pcb Pointer to the value to increment.
5508 *
5509 * @remarks x86: Requires a 486 or later.
5510 */
5511DECLINLINE(size_t) ASMAtomicIncZ(size_t volatile RT_FAR *pcb) RT_NOTHROW_DEF
5512{
5513#if ARCH_BITS == 64
5514 return ASMAtomicIncU64((uint64_t volatile RT_FAR *)pcb);
5515#elif ARCH_BITS == 32
5516 return ASMAtomicIncU32((uint32_t volatile RT_FAR *)pcb);
5517#elif ARCH_BITS == 16
5518 return ASMAtomicIncU16((uint16_t volatile RT_FAR *)pcb);
5519#else
5520# error "Unsupported ARCH_BITS value"
5521#endif
5522}
5523
5524
5525
/**
 * Atomically decrement an unsigned 16-bit value, ordered.
 *
 * Only a prototype is provided here; no inline implementation exists in
 * this part of the header.
 *
 * @returns The new value.  Note that the return type is declared as
 *          uint32_t although the operand is 16 bits wide.
 * @param   pu16        Pointer to the value to decrement.
 * @remarks Not implemented. Just to make 16-bit code happy.
 *
 * @remarks x86: Requires a 486 or later.
 */
RT_ASM_DECL_PRAGMA_WATCOM(uint32_t) ASMAtomicDecU16(uint16_t volatile RT_FAR *pu16) RT_NOTHROW_PROTO;
5536
5537
5538/**
5539 * Atomically decrement an unsigned 32-bit value, ordered.
5540 *
5541 * @returns The new value.
5542 * @param pu32 Pointer to the value to decrement.
5543 *
5544 * @remarks x86: Requires a 486 or later.
5545 */
5546#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
5547RT_ASM_DECL_PRAGMA_WATCOM(uint32_t) ASMAtomicDecU32(uint32_t volatile RT_FAR *pu32) RT_NOTHROW_PROTO;
5548#else
5549DECLINLINE(uint32_t) ASMAtomicDecU32(uint32_t volatile RT_FAR *pu32) RT_NOTHROW_DEF
5550{
5551# if RT_INLINE_ASM_USES_INTRIN
5552 return (uint32_t)_InterlockedDecrement((long RT_FAR *)pu32);
5553
5554# elif RT_INLINE_ASM_GNU_STYLE && defined(RT_ARCH_AMD64)
5555# if RT_INLINE_ASM_GNU_STYLE
5556 uint32_t u32;
5557 __asm__ __volatile__("lock; xaddl %0, %1\n\t"
5558 : "=r" (u32)
5559 , "=m" (*pu32)
5560 : "0" (-1)
5561 , "m" (*pu32)
5562 : "memory"
5563 , "cc");
5564 return u32-1;
5565# else
5566 uint32_t u32;
5567 __asm
5568 {
5569 mov eax, -1
5570# ifdef RT_ARCH_AMD64
5571 mov rdx, [pu32]
5572 lock xadd [rdx], eax
5573# else
5574 mov edx, [pu32]
5575 lock xadd [edx], eax
5576# endif
5577 mov u32, eax
5578 }
5579 return u32-1;
5580# endif
5581
5582# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
5583 /* M1 benchmark: ldaddal=6887 vs dmb+ldadd=2120 vs non-lse=6260 (ps/call) */
5584# if defined(RTASM_ARM64_USE_FEAT_LSE)
5585 uint32_t u32NewRet;
5586 __asm__ __volatile__("Lstart_ASMAtomicDecU32_%=:\n\t"
5587# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
5588 "ldaddal %w[uAddend], %w[uNewRet], %[pMem]\n\t"
5589# else
5590 RTASM_ARM_DMB_SY
5591 "ldadd %w[uAddend], %w[uNewRet], %[pMem]\n\t"
5592# endif
5593 "sub %w[uNewRet], %w[uNewRet], #1\n\t"
5594 : [pMem] "+Q" (*pu32)
5595 , [uNewRet] "=&r" (u32NewRet)
5596 : [uAddend] "r" (~(uint32_t)0)
5597 : );
5598# else
5599 RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_32(ASMAtomicDecU32, pu32, DMB_SY,
5600 "sub %w[uNew], %w[uNew], #1\n\t",
5601 "sub %[uNew], %[uNew], #1\n\t" /* arm6 / thumb2+ */,
5602 "X" (0) /* dummy */);
5603# endif
5604 return u32NewRet;
5605
5606# else
5607 return ASMAtomicSubU32(pu32, 1) - (uint32_t)1;
5608# endif
5609}
5610#endif
5611
5612
5613/**
5614 * Atomically decrement a signed 32-bit value, ordered.
5615 *
5616 * @returns The new value.
5617 * @param pi32 Pointer to the value to decrement.
5618 *
5619 * @remarks x86: Requires a 486 or later.
5620 */
5621DECLINLINE(int32_t) ASMAtomicDecS32(int32_t volatile RT_FAR *pi32) RT_NOTHROW_DEF
5622{
5623 return (int32_t)ASMAtomicDecU32((uint32_t volatile RT_FAR *)pi32);
5624}
5625
5626
5627/**
5628 * Atomically decrement an unsigned 64-bit value, ordered.
5629 *
5630 * @returns The new value.
5631 * @param pu64 Pointer to the value to decrement.
5632 *
5633 * @remarks x86: Requires a Pentium or later.
5634 */
5635#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
5636RT_ASM_DECL_PRAGMA_WATCOM(uint64_t) ASMAtomicDecU64(uint64_t volatile RT_FAR *pu64) RT_NOTHROW_PROTO;
5637#else
5638DECLINLINE(uint64_t) ASMAtomicDecU64(uint64_t volatile RT_FAR *pu64) RT_NOTHROW_DEF
5639{
5640# if RT_INLINE_ASM_USES_INTRIN && (defined(RT_ARCH_AMD64) || defined(RT_ARCH_ARM64))
5641 return (uint64_t)_InterlockedDecrement64((__int64 volatile RT_FAR *)pu64);
5642
5643# elif RT_INLINE_ASM_GNU_STYLE && defined(RT_ARCH_AMD64)
5644 uint64_t u64;
5645 __asm__ __volatile__("lock; xaddq %q0, %1\n\t"
5646 : "=r" (u64)
5647 , "=m" (*pu64)
5648 : "0" (~(uint64_t)0)
5649 , "m" (*pu64)
5650 : "memory"
5651 , "cc");
5652 return u64-1;
5653
5654# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
5655# if defined(RTASM_ARM64_USE_FEAT_LSE)
5656 uint64_t u64NewRet;
5657 __asm__ __volatile__("Lstart_ASMAtomicDecU64_%=:\n\t"
5658# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
5659 "ldaddal %[uAddend], %[uNewRet], %[pMem]\n\t"
5660# else
5661 RTASM_ARM_DMB_SY
5662 "ldadd %[uAddend], %[uNewRet], %[pMem]\n\t"
5663# endif
5664 "sub %[uNewRet], %[uNewRet], #1\n\t"
5665 : [pMem] "+Q" (*pu64)
5666 , [uNewRet] "=&r" (u64NewRet)
5667 : [uAddend] "r" (~(uint64_t)0)
5668 : );
5669# else
5670 RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_64(ASMAtomicDecU64, pu64, DMB_SY,
5671 "sub %[uNew], %[uNew], #1\n\t"
5672 ,
5673 "sub %[uNew], %[uNew], #1\n\t" /* arm6 / thumb2+ */
5674 "sbc %H[uNew], %H[uNew], %[uZeroVal]\n\t",
5675 RTASM_ARM_PICK_6432("X" (0) /* dummy */, [uZeroVal] "r" (0)) );
5676# endif
5677 return u64NewRet;
5678
5679# else
5680 return ASMAtomicAddU64(pu64, UINT64_MAX) - 1;
5681# endif
5682}
5683#endif
5684
5685
5686/**
5687 * Atomically decrement a signed 64-bit value, ordered.
5688 *
5689 * @returns The new value.
5690 * @param pi64 Pointer to the value to decrement.
5691 *
5692 * @remarks x86: Requires a Pentium or later.
5693 */
5694DECLINLINE(int64_t) ASMAtomicDecS64(int64_t volatile RT_FAR *pi64) RT_NOTHROW_DEF
5695{
5696 return (int64_t)ASMAtomicDecU64((uint64_t volatile RT_FAR *)pi64);
5697}
5698
5699
5700/**
5701 * Atomically decrement a size_t value, ordered.
5702 *
5703 * @returns The new value.
5704 * @param pcb Pointer to the value to decrement.
5705 *
5706 * @remarks x86: Requires a 486 or later.
5707 */
5708DECLINLINE(size_t) ASMAtomicDecZ(size_t volatile RT_FAR *pcb) RT_NOTHROW_DEF
5709{
5710#if ARCH_BITS == 64
5711 return ASMAtomicDecU64((uint64_t volatile RT_FAR *)pcb);
5712#elif ARCH_BITS == 32
5713 return ASMAtomicDecU32((uint32_t volatile RT_FAR *)pcb);
5714#elif ARCH_BITS == 16
5715 return ASMAtomicDecU16((uint16_t volatile RT_FAR *)pcb);
5716#else
5717# error "Unsupported ARCH_BITS value"
5718#endif
5719}
5720
5721
/**
 * Atomically Or an unsigned 32-bit value, ordered.
 *
 * Depending on the build configuration this is either an external assembly
 * routine (prototype only) or an inline function using a compiler intrinsic,
 * x86 inline assembly or ARM inline assembly.  No value is returned; see
 * ASMAtomicOrExU32 for the variant returning the old value.
 *
 * @param   pu32    Pointer to the variable to OR u32 with.
 * @param   u32     The value to OR *pu32 with.
 *
 * @remarks x86: Requires a 386 or later.
 */
#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
RT_ASM_DECL_PRAGMA_WATCOM(void) ASMAtomicOrU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_PROTO;
#else
DECLINLINE(void) ASMAtomicOrU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_DEF
{
# if RT_INLINE_ASM_USES_INTRIN
    /* The intrinsic's return value is not needed here and is dropped. */
    _InterlockedOr((long volatile RT_FAR *)pu32, (long)u32);

# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
#  if RT_INLINE_ASM_GNU_STYLE
    /* Locked OR directly on the memory operand; only the flags are listed as
       clobbered. */
    __asm__ __volatile__("lock; orl %1, %0\n\t"
                         : "=m" (*pu32)
                         : "ir" (u32)
                         , "m" (*pu32)
                         : "cc");
#  else
    __asm
    {
        mov     eax, [u32]
#   ifdef RT_ARCH_AMD64
        mov     rdx, [pu32]
        lock or [rdx], eax
#   else
        mov     edx, [pu32]
        lock or [edx], eax
#   endif
    }
#  endif

# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
#  if defined(RTASM_ARM64_USE_FEAT_LSE)
#   if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
    /* ldsetal needs a destination register; u32Spill receives the loaded
       value and is never read afterwards. */
    uint32_t u32Spill;
    __asm__ __volatile__("Lstart_ASMAtomicOrU32_%=:\n\t"
                         "ldsetal %w[fBitsToSet], %w[uSpill], %[pMem]\n\t"
                         : [pMem] "+Q" (*pu32)
                         , [uSpill] "=&r" (u32Spill)
                         : [fBitsToSet] "r" (u32)
                         : );
#   else
    /* Barrier followed by the store-only form, so no spill register is needed. */
    __asm__ __volatile__("Lstart_ASMAtomicOrU32_%=:\n\t"
                         RTASM_ARM_DMB_SY
                         "stset %w[fBitsToSet], %[pMem]\n\t"
                         : [pMem] "+Q" (*pu32)
                         : [fBitsToSet] "r" (u32)
                         : );
#   endif
#  else
    /* For more on Orr see https://en.wikipedia.org/wiki/Orr_(Catch-22) ;-) */
    /* The first macro argument is spelled ASMAtomicOr32 (no 'U') here. */
    RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_32(ASMAtomicOr32, pu32, DMB_SY,
                                           "orr %w[uNew], %w[uNew], %w[uVal]\n\t",
                                           "orr %[uNew], %[uNew], %[uVal]\n\t",
                                           [uVal] "r" (u32));

#  endif
# else
#  error "Port me"
# endif
}
#endif
5790
5791
/**
 * Atomically OR an unsigned 32-bit value, ordered, extended version (for bitmap
 * fallback).
 *
 * Unlike ASMAtomicOrU32 this variant returns the value *pu32 held before the
 * OR.  It uses a compiler intrinsic or ARM inline assembly where available
 * and otherwise a compare-and-exchange retry loop.
 *
 * @returns Old value.
 * @param   pu32    Pointer to the variable to OR @a u32 with.
 * @param   u32     The value to OR @a *pu32 with.
 */
DECLINLINE(uint32_t) ASMAtomicOrExU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_DEF
{
#if RT_INLINE_ASM_USES_INTRIN
    return (uint32_t)_InterlockedOr((long volatile RT_FAR *)pu32, (long)u32);

#elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
# if defined(RTASM_ARM64_USE_FEAT_LSE)
    /* LSE path: either a single ldsetal, or a barrier followed by plain ldset;
       the loaded value ends up in u32OldRet. */
    uint32_t u32OldRet;
    __asm__ __volatile__("Lstart_ASMAtomicOrExU32_%=:\n\t"
#  if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
                         "ldsetal %w[fBitsToSet], %w[uOldRet], %[pMem]\n\t"
#  else
                         RTASM_ARM_DMB_SY
                         "ldset %w[fBitsToSet], %w[uOldRet], %[pMem]\n\t"
#  endif
                         : [pMem] "+Q" (*pu32)
                         , [uOldRet] "=&r" (u32OldRet)
                         : [fBitsToSet] "r" (u32)
                         : );
# else
    /* NOTE(review): u32OldRet is presumably declared by the macro below -
       confirm against the macro definition. */
    RTASM_ARM_LOAD_MODIFY_STORE_RET_OLD_32(ASMAtomicOrEx32, pu32, DMB_SY,
                                           "orr %w[uNew], %w[uOld], %w[uVal]\n\t",
                                           "orr %[uNew], %[uOld], %[uVal]\n\t",
                                           [uVal] "r" (u32));
# endif
    return u32OldRet;

#else
    /* Generic fallback: recompute the OR and retry until the exchange
       succeeds.  NOTE(review): this relies on ASMAtomicCmpXchgExU32 storing
       the current memory value in u32RetOld on failure - confirm against its
       definition. */
    uint32_t u32RetOld = ASMAtomicUoReadU32(pu32);
    uint32_t u32New;
    do
        u32New = u32RetOld | u32;
    while (!ASMAtomicCmpXchgExU32(pu32, u32New, u32RetOld, &u32RetOld));
    return u32RetOld;
#endif
}
5836
5837
5838/**
5839 * Atomically Or a signed 32-bit value, ordered.
5840 *
5841 * @param pi32 Pointer to the pointer variable to OR u32 with.
5842 * @param i32 The value to OR *pu32 with.
5843 *
5844 * @remarks x86: Requires a 386 or later.
5845 */
5846DECLINLINE(void) ASMAtomicOrS32(int32_t volatile RT_FAR *pi32, int32_t i32) RT_NOTHROW_DEF
5847{
5848 ASMAtomicOrU32((uint32_t volatile RT_FAR *)pi32, (uint32_t)i32);
5849}
5850
5851
/**
 * Atomically Or an unsigned 64-bit value, ordered.
 *
 * Depending on the build configuration this is either an external assembly
 * routine (prototype only) or an inline function using a compiler intrinsic,
 * AMD64 inline assembly, ARM inline assembly, or a compare-and-exchange
 * retry loop as the generic fallback.
 *
 * @param   pu64    Pointer to the variable to OR u64 with.
 * @param   u64     The value to OR *pu64 with.
 *
 * @remarks x86: Requires a Pentium or later.
 */
#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
DECLASM(void) ASMAtomicOrU64(uint64_t volatile RT_FAR *pu64, uint64_t u64) RT_NOTHROW_PROTO;
#else
DECLINLINE(void) ASMAtomicOrU64(uint64_t volatile RT_FAR *pu64, uint64_t u64) RT_NOTHROW_DEF
{
# if RT_INLINE_ASM_USES_INTRIN && (defined(RT_ARCH_AMD64) || defined(RT_ARCH_ARM64))
    /* The intrinsic's return value is not needed here and is dropped. */
    _InterlockedOr64((__int64 volatile RT_FAR *)pu64, (__int64)u64);

# elif RT_INLINE_ASM_GNU_STYLE && defined(RT_ARCH_AMD64)
    /* Locked OR directly on the memory operand; only the flags are listed as
       clobbered. */
    __asm__ __volatile__("lock; orq %1, %q0\n\t"
                         : "=m" (*pu64)
                         : "r" (u64)
                         , "m" (*pu64)
                         : "cc");

# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
#  if defined(RTASM_ARM64_USE_FEAT_LSE)
#   if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
    /* ldsetal needs a destination register; u64Spill receives the loaded
       value and is never read afterwards. */
    uint64_t u64Spill;
    __asm__ __volatile__("Lstart_ASMAtomicOrU64_%=:\n\t"
                         "ldsetal %[fBitsToSet], %[uSpill], %[pMem]\n\t"
                         : [pMem] "+Q" (*pu64)
                         , [uSpill] "=&r" (u64Spill)
                         : [fBitsToSet] "r" (u64)
                         : );
#   else
    /* Barrier followed by the store-only form, so no spill register is needed. */
    __asm__ __volatile__("Lstart_ASMAtomicOrU64_%=:\n\t"
                         RTASM_ARM_DMB_SY
                         "stset %[fBitsToSet], %[pMem]\n\t"
                         : [pMem] "+Q" (*pu64)
                         : [fBitsToSet] "r" (u64)
                         : );
#   endif
#  else
    /* The second sequence ORs the low and the high (%H) halves separately,
       which looks like the 32-bit ARM variant - TODO confirm against the
       macro definition. */
    RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_64(ASMAtomicOrU64, pu64, DMB_SY,
                                           "orr %[uNew], %[uNew], %[uVal]\n\t"
                                           ,
                                           "orr %[uNew], %[uNew], %[uVal]\n\t"
                                           "orr %H[uNew], %H[uNew], %H[uVal]\n\t",
                                           [uVal] "r" (u64));
#  endif

# else
    /* Generic fallback: retry a compare-and-exchange until the value the OR
       was based on is still the one in memory. */
    for (;;)
    {
        uint64_t u64Old = ASMAtomicUoReadU64(pu64);
        uint64_t u64New = u64Old | u64;
        if (ASMAtomicCmpXchgU64(pu64, u64New, u64Old))
            break;
        ASMNopPause();
    }
# endif
}
#endif
5914
5915
5916/**
5917 * Atomically Or a signed 64-bit value, ordered.
5918 *
5919 * @param pi64 Pointer to the pointer variable to OR u64 with.
5920 * @param i64 The value to OR *pu64 with.
5921 *
5922 * @remarks x86: Requires a Pentium or later.
5923 */
5924DECLINLINE(void) ASMAtomicOrS64(int64_t volatile RT_FAR *pi64, int64_t i64) RT_NOTHROW_DEF
5925{
5926 ASMAtomicOrU64((uint64_t volatile RT_FAR *)pi64, (uint64_t)i64);
5927}
5928
5929
/**
 * Atomically And an unsigned 32-bit value, ordered.
 *
 * Depending on the build configuration this is either an external assembly
 * routine (prototype only) or an inline function using a compiler intrinsic,
 * x86 inline assembly or ARM inline assembly.  No value is returned.
 *
 * @param   pu32    Pointer to the variable to AND u32 with.
 * @param   u32     The value to AND *pu32 with.
 *
 * @remarks x86: Requires a 386 or later.
 */
#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
RT_ASM_DECL_PRAGMA_WATCOM(void) ASMAtomicAndU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_PROTO;
#else
DECLINLINE(void) ASMAtomicAndU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_DEF
{
# if RT_INLINE_ASM_USES_INTRIN
    /* The intrinsic's return value is not needed here and is dropped. */
    _InterlockedAnd((long volatile RT_FAR *)pu32, u32);

# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
#  if RT_INLINE_ASM_GNU_STYLE
    /* Locked AND directly on the memory operand; only the flags are listed
       as clobbered. */
    __asm__ __volatile__("lock; andl %1, %0\n\t"
                         : "=m" (*pu32)
                         : "ir" (u32)
                         , "m" (*pu32)
                         : "cc");
#  else
    __asm
    {
        mov     eax, [u32]
#   ifdef RT_ARCH_AMD64
        mov     rdx, [pu32]
        lock and [rdx], eax
#   else
        mov     edx, [pu32]
        lock and [edx], eax
#   endif
    }
#  endif

# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
#  if defined(RTASM_ARM64_USE_FEAT_LSE)
    /* The LSE instructions used here clear bits, so the inverted mask (~u32)
       is passed as the set of bits to clear. */
#   if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
    /* ldclral needs a destination register; u32Spill receives the loaded
       value and is never read afterwards. */
    uint32_t u32Spill;
    __asm__ __volatile__("Lstart_ASMAtomicAndU32_%=:\n\t"
                         "ldclral %w[fBitsToClear], %w[uSpill], %[pMem]\n\t"
                         : [pMem] "+Q" (*pu32)
                         , [uSpill] "=&r" (u32Spill)
                         : [fBitsToClear] "r" (~u32)
                         : );
#   else
    __asm__ __volatile__("Lstart_ASMAtomicAndU32_%=:\n\t"
                         RTASM_ARM_DMB_SY
                         "stclr %w[fBitsToClear], %[pMem]\n\t"
                         : [pMem] "+Q" (*pu32)
                         : [fBitsToClear] "r" (~u32)
                         : );
#   endif
#  else
    /* The first macro argument is spelled ASMAtomicAnd32 (no 'U') here. */
    RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_32(ASMAtomicAnd32, pu32, DMB_SY,
                                           "and %w[uNew], %w[uNew], %w[uVal]\n\t",
                                           "and %[uNew], %[uNew], %[uVal]\n\t",
                                           [uVal] "r" (u32));

#  endif
# else
#  error "Port me"
# endif
}
#endif
5997
5998
5999/**
6000 * Atomically AND an unsigned 32-bit value, ordered, extended version.
6001 *
6002 * @returns Old value.
6003 * @param pu32 Pointer to the variable to AND @a u32 with.
6004 * @param u32 The value to AND @a *pu32 with.
6005 */
6006DECLINLINE(uint32_t) ASMAtomicAndExU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_DEF
6007{
6008#if RT_INLINE_ASM_USES_INTRIN
6009 return (uint32_t)_InterlockedAnd((long volatile RT_FAR *)pu32, (long)u32);
6010
6011#elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
6012# if defined(RTASM_ARM64_USE_FEAT_LSE)
6013 uint32_t u32OldRet;
6014 __asm__ __volatile__("Lstart_ASMAtomicAndExU32_%=:\n\t"
6015# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
6016 "ldclral %w[fBitsToClear], %w[uOldRet], %[pMem]\n\t"
6017# else
6018 RTASM_ARM_DMB_SY
6019 "ldclr %w[fBitsToClear], %w[uOldRet], %[pMem]\n\t"
6020# endif
6021 : [pMem] "+Q" (*pu32)
6022 , [uOldRet] "=&r" (u32OldRet)
6023 : [fBitsToClear] "r" (~u32)
6024 : );
6025# else
6026 RTASM_ARM_LOAD_MODIFY_STORE_RET_OLD_32(ASMAtomicAndEx32, pu32, DMB_SY,
6027 "and %w[uNew], %w[uOld], %w[uVal]\n\t",
6028 "and %[uNew], %[uOld], %[uVal]\n\t",
6029 [uVal] "r" (u32));
6030# endif
6031 return u32OldRet;
6032
6033#else
6034 uint32_t u32RetOld = ASMAtomicUoReadU32(pu32);
6035 uint32_t u32New;
6036 do
6037 u32New = u32RetOld & u32;
6038 while (!ASMAtomicCmpXchgExU32(pu32, u32New, u32RetOld, &u32RetOld));
6039 return u32RetOld;
6040#endif
6041}
6042
6043
6044/**
6045 * Atomically And a signed 32-bit value, ordered.
6046 *
6047 * @param pi32 Pointer to the pointer variable to AND i32 with.
6048 * @param i32 The value to AND *pi32 with.
6049 *
6050 * @remarks x86: Requires a 386 or later.
6051 */
6052DECLINLINE(void) ASMAtomicAndS32(int32_t volatile RT_FAR *pi32, int32_t i32) RT_NOTHROW_DEF
6053{
6054 ASMAtomicAndU32((uint32_t volatile RT_FAR *)pi32, (uint32_t)i32);
6055}
6056
6057
6058/**
6059 * Atomically And an unsigned 64-bit value, ordered.
6060 *
6061 * @param pu64 Pointer to the pointer variable to AND u64 with.
6062 * @param u64 The value to AND *pu64 with.
6063 *
6064 * @remarks x86: Requires a Pentium or later.
6065 */
6066#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
6067DECLASM(void) ASMAtomicAndU64(uint64_t volatile RT_FAR *pu64, uint64_t u64) RT_NOTHROW_PROTO;
6068#else
6069DECLINLINE(void) ASMAtomicAndU64(uint64_t volatile RT_FAR *pu64, uint64_t u64) RT_NOTHROW_DEF
6070{
6071# if RT_INLINE_ASM_USES_INTRIN && (defined(RT_ARCH_AMD64) || defined(RT_ARCH_ARM64))
6072 _InterlockedAnd64((__int64 volatile RT_FAR *)pu64, u64);
6073
6074# elif RT_INLINE_ASM_GNU_STYLE && defined(RT_ARCH_AMD64)
6075 __asm__ __volatile__("lock; andq %1, %0\n\t"
6076 : "=m" (*pu64)
6077 : "r" (u64)
6078 , "m" (*pu64)
6079 : "cc");
6080
6081# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
6082# if defined(RTASM_ARM64_USE_FEAT_LSE)
6083# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
6084 uint64_t u64Spill;
6085 __asm__ __volatile__("Lstart_ASMAtomicAndU64_%=:\n\t"
6086 "ldclral %[fBitsToClear], %[uSpill], %[pMem]\n\t"
6087 : [pMem] "+Q" (*pu64)
6088 , [uSpill] "=&r" (u64Spill)
6089 : [fBitsToClear] "r" (~u64)
6090 : );
6091# else
6092 __asm__ __volatile__("Lstart_ASMAtomicAndU64_%=:\n\t"
6093 RTASM_ARM_DMB_SY
6094 "stclr %[fBitsToClear], %[pMem]\n\t"
6095 : [pMem] "+Q" (*pu64)
6096 : [fBitsToClear] "r" (~u64)
6097 : );
6098# endif
6099# else
6100 RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_64(ASMAtomicAndU64, pu64, DMB_SY,
6101 "and %[uNew], %[uNew], %[uVal]\n\t"
6102 ,
6103 "and %[uNew], %[uNew], %[uVal]\n\t"
6104 "and %H[uNew], %H[uNew], %H[uVal]\n\t",
6105 [uVal] "r" (u64));
6106# endif
6107
6108# else
6109 for (;;)
6110 {
6111 uint64_t u64Old = ASMAtomicUoReadU64(pu64);
6112 uint64_t u64New = u64Old & u64;
6113 if (ASMAtomicCmpXchgU64(pu64, u64New, u64Old))
6114 break;
6115 ASMNopPause();
6116 }
6117# endif
6118}
6119#endif
6120
6121
6122/**
6123 * Atomically And a signed 64-bit value, ordered.
6124 *
6125 * @param pi64 Pointer to the pointer variable to AND i64 with.
6126 * @param i64 The value to AND *pi64 with.
6127 *
6128 * @remarks x86: Requires a Pentium or later.
6129 */
6130DECLINLINE(void) ASMAtomicAndS64(int64_t volatile RT_FAR *pi64, int64_t i64) RT_NOTHROW_DEF
6131{
6132 ASMAtomicAndU64((uint64_t volatile RT_FAR *)pi64, (uint64_t)i64);
6133}
6134
6135
6136/**
6137 * Atomically XOR an unsigned 32-bit value and a memory location, ordered.
6138 *
6139 * @param pu32 Pointer to the variable to XOR @a u32 with.
6140 * @param u32 The value to XOR @a *pu32 with.
6141 *
6142 * @remarks x86: Requires a 386 or later.
6143 */
6144#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
6145RT_ASM_DECL_PRAGMA_WATCOM(void) ASMAtomicXorU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_PROTO;
6146#else
6147DECLINLINE(void) ASMAtomicXorU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_DEF
6148{
6149# if RT_INLINE_ASM_USES_INTRIN
6150 _InterlockedXor((long volatile RT_FAR *)pu32, u32);
6151
6152# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
6153# if RT_INLINE_ASM_GNU_STYLE
6154 __asm__ __volatile__("lock; xorl %1, %0\n\t"
6155 : "=m" (*pu32)
6156 : "ir" (u32)
6157 , "m" (*pu32)
6158 : "cc");
6159# else
6160 __asm
6161 {
6162 mov eax, [u32]
6163# ifdef RT_ARCH_AMD64
6164 mov rdx, [pu32]
6165 lock xor [rdx], eax
6166# else
6167 mov edx, [pu32]
6168 lock xor [edx], eax
6169# endif
6170 }
6171# endif
6172
6173# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
6174# if defined(RTASM_ARM64_USE_FEAT_LSE)
6175# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
6176 uint32_t u32Spill;
6177 __asm__ __volatile__("Lstart_ASMAtomicXorU32_%=:\n\t"
6178 "ldeoral %w[fBitMask], %w[uSpill], %[pMem]\n\t"
6179 : [pMem] "+Q" (*pu32)
6180 , [uSpill] "=&r" (u32Spill)
6181 : [fBitMask] "r" (u32)
6182 : );
6183# else
6184 __asm__ __volatile__("Lstart_ASMAtomicXorU32_%=:\n\t"
6185 RTASM_ARM_DMB_SY
6186 "steor %w[fBitMask], %[pMem]\n\t"
6187 : [pMem] "+Q" (*pu32)
6188 : [fBitMask] "r" (u32)
6189 : );
6190# endif
6191# else
6192 RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_32(ASMAtomicXor32, pu32, DMB_SY,
6193 "eor %w[uNew], %w[uNew], %w[uVal]\n\t",
6194 "eor %[uNew], %[uNew], %[uVal]\n\t",
6195 [uVal] "r" (u32));
6196# endif
6197
6198# else
6199# error "Port me"
6200# endif
6201}
6202#endif
6203
6204
6205/**
6206 * Atomically XOR an unsigned 32-bit value and a memory location, ordered,
6207 * extended version (for bitmaps).
6208 *
6209 * @returns Old value.
6210 * @param pu32 Pointer to the variable to XOR @a u32 with.
6211 * @param u32 The value to XOR @a *pu32 with.
6212 */
6213DECLINLINE(uint32_t) ASMAtomicXorExU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_DEF
6214{
6215# if RT_INLINE_ASM_USES_INTRIN
6216 return (uint32_t)_InterlockedXor((long volatile RT_FAR *)pu32, u32);
6217
6218#elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
6219# if defined(RTASM_ARM64_USE_FEAT_LSE)
6220 uint32_t u32OldRet;
6221 __asm__ __volatile__("Lstart_ASMAtomicXorExU32_%=:\n\t"
6222# if defined(RTASM_ARM64_USE_FEAT_LSE_WITHOUT_DMB)
6223 "ldeoral %w[fBitMask], %w[uOldRet], %[pMem]\n\t"
6224# else
6225 RTASM_ARM_DMB_SY
6226 "ldeor %w[fBitMask], %w[uOldRet], %[pMem]\n\t"
6227# endif
6228 : [pMem] "+Q" (*pu32)
6229 , [uOldRet] "=&r" (u32OldRet)
6230 : [fBitMask] "r" (u32)
6231 : );
6232# else
6233 RTASM_ARM_LOAD_MODIFY_STORE_RET_OLD_32(ASMAtomicXorEx32, pu32, DMB_SY,
6234 "eor %w[uNew], %w[uOld], %w[uVal]\n\t",
6235 "eor %[uNew], %[uOld], %[uVal]\n\t",
6236 [uVal] "r" (u32));
6237# endif
6238 return u32OldRet;
6239
6240#else
6241 uint32_t u32RetOld = ASMAtomicUoReadU32(pu32);
6242 uint32_t u32New;
6243 do
6244 u32New = u32RetOld ^ u32;
6245 while (!ASMAtomicCmpXchgExU32(pu32, u32New, u32RetOld, &u32RetOld));
6246 return u32RetOld;
6247#endif
6248}
6249
6250
6251/**
6252 * Atomically XOR a signed 32-bit value, ordered.
6253 *
6254 * @param pi32 Pointer to the variable to XOR i32 with.
6255 * @param i32 The value to XOR *pi32 with.
6256 *
6257 * @remarks x86: Requires a 386 or later.
6258 */
6259DECLINLINE(void) ASMAtomicXorS32(int32_t volatile RT_FAR *pi32, int32_t i32) RT_NOTHROW_DEF
6260{
6261 ASMAtomicXorU32((uint32_t volatile RT_FAR *)pi32, (uint32_t)i32);
6262}
6263
6264
6265/**
6266 * Atomically OR an unsigned 32-bit value, unordered but interrupt safe.
6267 *
6268 * @param pu32 Pointer to the pointer variable to OR u32 with.
6269 * @param u32 The value to OR *pu32 with.
6270 *
6271 * @remarks x86: Requires a 386 or later.
6272 */
6273#if RT_INLINE_ASM_EXTERNAL_TMP_ARM
6274RT_ASM_DECL_PRAGMA_WATCOM(void) ASMAtomicUoOrU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_PROTO;
6275#else
6276DECLINLINE(void) ASMAtomicUoOrU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_DEF
6277{
6278# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
6279# if RT_INLINE_ASM_GNU_STYLE
6280 __asm__ __volatile__("orl %1, %0\n\t"
6281 : "=m" (*pu32)
6282 : "ir" (u32)
6283 , "m" (*pu32)
6284 : "cc");
6285# else
6286 __asm
6287 {
6288 mov eax, [u32]
6289# ifdef RT_ARCH_AMD64
6290 mov rdx, [pu32]
6291 or [rdx], eax
6292# else
6293 mov edx, [pu32]
6294 or [edx], eax
6295# endif
6296 }
6297# endif
6298
6299# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
6300# if RT_INLINE_ASM_USES_INTRIN
6301 _InterlockedOr_nf((long volatile RT_FAR *)pu32, u32); /* similar to the non-lse code below */
6302
6303 /* M1 benchmark: stset=1974 vs non-lse=6271 */
6304# elif defined(RTASM_ARM64_USE_FEAT_LSE)
6305 __asm__ __volatile__("Lstart_ASMAtomicUoOrU32_%=:\n\t"
6306 "stset %w[fBitsToSet], %[pMem]\n\t"
6307 : [pMem] "+Q" (*pu32)
6308 : [fBitsToSet] "r" (u32)
6309 : );
6310# else
6311 RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_32(ASMAtomicUoOrU32, pu32, NO_BARRIER,
6312 "orr %w[uNew], %w[uNew], %w[uVal]\n\t",
6313 "orr %[uNew], %[uNew], %[uVal]\n\t",
6314 [uVal] "r" (u32));
6315# endif
6316
6317# else
6318# error "Port me"
6319# endif
6320}
6321#endif
6322
6323
6324/**
6325 * Atomically OR an unsigned 32-bit value, unordered but interrupt safe,
6326 * extended version (for bitmap fallback).
6327 *
6328 * @returns Old value.
6329 * @param pu32 Pointer to the variable to OR @a u32 with.
6330 * @param u32 The value to OR @a *pu32 with.
6331 */
6332DECLINLINE(uint32_t) ASMAtomicUoOrExU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_DEF
6333{
6334#if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
6335# if RT_INLINE_ASM_USES_INTRIN
6336 return (uint32_t)_InterlockedOr_nf((long volatile RT_FAR *)pu32, u32); /* similar to the non-lse code below */
6337
6338# else
6339# if defined(RTASM_ARM64_USE_FEAT_LSE)
6340 uint32_t u32OldRet;
6341 __asm__ __volatile__("Lstart_ASMAtomicOrExU32_%=:\n\t"
6342 "ldset %w[fBitsToSet], %w[uOldRet], %[pMem]\n\t"
6343 : [pMem] "+Q" (*pu32)
6344 , [uOldRet] "=&r" (u32OldRet)
6345 : [fBitsToSet] "r" (u32)
6346 : );
6347# else
6348 RTASM_ARM_LOAD_MODIFY_STORE_RET_OLD_32(ASMAtomicUoOrExU32, pu32, NO_BARRIER,
6349 "orr %w[uNew], %w[uOld], %w[uVal]\n\t",
6350 "orr %[uNew], %[uOld], %[uVal]\n\t",
6351 [uVal] "r" (u32));
6352# endif
6353 return u32OldRet;
6354# endif
6355
6356#else
6357 return ASMAtomicOrExU32(pu32, u32); /* (we have no unordered cmpxchg primitive atm.) */
6358#endif
6359}
6360
6361
6362/**
6363 * Atomically OR a signed 32-bit value, unordered.
6364 *
6365 * @param pi32 Pointer to the pointer variable to OR u32 with.
6366 * @param i32 The value to OR *pu32 with.
6367 *
6368 * @remarks x86: Requires a 386 or later.
6369 */
6370DECLINLINE(void) ASMAtomicUoOrS32(int32_t volatile RT_FAR *pi32, int32_t i32) RT_NOTHROW_DEF
6371{
6372 ASMAtomicUoOrU32((uint32_t volatile RT_FAR *)pi32, (uint32_t)i32);
6373}
6374
6375
6376/**
6377 * Atomically OR an unsigned 64-bit value, unordered.
6378 *
6379 * @param pu64 Pointer to the pointer variable to OR u64 with.
6380 * @param u64 The value to OR *pu64 with.
6381 *
6382 * @remarks x86: Requires a Pentium or later.
6383 */
6384#if RT_INLINE_ASM_EXTERNAL_TMP_ARM
6385DECLASM(void) ASMAtomicUoOrU64(uint64_t volatile RT_FAR *pu64, uint64_t u64) RT_NOTHROW_PROTO;
6386#else
6387DECLINLINE(void) ASMAtomicUoOrU64(uint64_t volatile RT_FAR *pu64, uint64_t u64) RT_NOTHROW_DEF
6388{
6389# if RT_INLINE_ASM_GNU_STYLE && defined(RT_ARCH_AMD64)
6390 __asm__ __volatile__("orq %1, %q0\n\t"
6391 : "=m" (*pu64)
6392 : "r" (u64)
6393 , "m" (*pu64)
6394 : "cc");
6395
6396# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
6397# if RT_INLINE_ASM_USES_INTRIN
6398 _InterlockedOr64_nf((volatile int64_t *)pu64, (int64_t)u64); /* similar to the non-lse code below */
6399
6400# elif defined(RTASM_ARM64_USE_FEAT_LSE)
6401 __asm__ __volatile__("Lstart_ASMAtomicUoOrU64_%=:\n\t"
6402 "stset %[fBitsToSet], %[pMem]\n\t"
6403 : [pMem] "+Q" (*pu64)
6404 : [fBitsToSet] "r" (u64)
6405 : );
6406# else
6407 RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_64(ASMAtomicUoOrU64, pu64, NO_BARRIER,
6408 "orr %[uNew], %[uNew], %[uVal]\n\t"
6409 ,
6410 "orr %[uNew], %[uNew], %[uVal]\n\t"
6411 "orr %H[uNew], %H[uNew], %H[uVal]\n\t",
6412 [uVal] "r" (u64));
6413# endif
6414
6415# else
6416 for (;;)
6417 {
6418 uint64_t u64Old = ASMAtomicUoReadU64(pu64);
6419 uint64_t u64New = u64Old | u64;
6420 if (ASMAtomicCmpXchgU64(pu64, u64New, u64Old))
6421 break;
6422 ASMNopPause();
6423 }
6424# endif
6425}
6426#endif
6427
6428
6429/**
6430 * Atomically Or a signed 64-bit value, unordered.
6431 *
6432 * @param pi64 Pointer to the pointer variable to OR u64 with.
6433 * @param i64 The value to OR *pu64 with.
6434 *
6435 * @remarks x86: Requires a Pentium or later.
6436 */
6437DECLINLINE(void) ASMAtomicUoOrS64(int64_t volatile RT_FAR *pi64, int64_t i64) RT_NOTHROW_DEF
6438{
6439 ASMAtomicUoOrU64((uint64_t volatile RT_FAR *)pi64, (uint64_t)i64);
6440}
6441
6442
6443/**
6444 * Atomically And an unsigned 32-bit value, unordered.
6445 *
6446 * @param pu32 Pointer to the pointer variable to AND u32 with.
6447 * @param u32 The value to AND *pu32 with.
6448 *
6449 * @remarks x86: Requires a 386 or later.
6450 */
6451#if RT_INLINE_ASM_EXTERNAL_TMP_ARM
6452RT_ASM_DECL_PRAGMA_WATCOM(void) ASMAtomicUoAndU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_PROTO;
6453#else
6454DECLINLINE(void) ASMAtomicUoAndU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_DEF
6455{
6456# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
6457# if RT_INLINE_ASM_GNU_STYLE
6458 __asm__ __volatile__("andl %1, %0\n\t"
6459 : "=m" (*pu32)
6460 : "ir" (u32)
6461 , "m" (*pu32)
6462 : "cc");
6463# else
6464 __asm
6465 {
6466 mov eax, [u32]
6467# ifdef RT_ARCH_AMD64
6468 mov rdx, [pu32]
6469 and [rdx], eax
6470# else
6471 mov edx, [pu32]
6472 and [edx], eax
6473# endif
6474 }
6475# endif
6476
6477# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
6478# if RT_INLINE_ASM_USES_INTRIN
6479 _InterlockedAnd_nf((volatile long *)pu32, (long)u32); /* similar to the non-lse code below */
6480# elif defined(RTASM_ARM64_USE_FEAT_LSE)
6481 /* M1 benchmark: stclr=1884 vs non-lse=6299 (ps/call) */
6482 __asm__ __volatile__("Lstart_ASMAtomicUoAndU32_%=:\n\t"
6483 "stclr %w[fBitsToClear], %[pMem]\n\t"
6484 : [pMem] "+Q" (*pu32)
6485 : [fBitsToClear] "r" (~u32)
6486 : );
6487# else
6488 RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_32(ASMAtomicUoAnd32, pu32, NO_BARRIER,
6489 "and %w[uNew], %w[uNew], %w[uVal]\n\t",
6490 "and %[uNew], %[uNew], %[uVal]\n\t",
6491 [uVal] "r" (u32));
6492# endif
6493
6494# else
6495# error "Port me"
6496# endif
6497}
6498#endif
6499
6500
6501/**
6502 * Atomically AND an unsigned 32-bit value, unordered, extended version (for
6503 * bitmap fallback).
6504 *
6505 * @returns Old value.
6506 * @param pu32 Pointer to the pointer to AND @a u32 with.
6507 * @param u32 The value to AND @a *pu32 with.
6508 */
6509DECLINLINE(uint32_t) ASMAtomicUoAndExU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_DEF
6510{
6511#if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
6512# if RT_INLINE_ASM_USES_INTRIN
6513 return (uint32_t)_InterlockedAnd_nf((volatile long *)pu32, (long)u32); /* similar code to the non-lse case below */
6514# else
6515# if defined(RTASM_ARM64_USE_FEAT_LSE)
6516 uint32_t u32OldRet;
6517 __asm__ __volatile__("Lstart_ASMAtomicAndExU32_%=:\n\t"
6518 "ldclr %w[fBitsToClear], %w[uOldRet], %[pMem]\n\t"
6519 : [pMem] "+Q" (*pu32)
6520 , [uOldRet] "=&r" (u32OldRet)
6521 : [fBitsToClear] "r" (~u32)
6522 : );
6523# else
6524 RTASM_ARM_LOAD_MODIFY_STORE_RET_OLD_32(ASMAtomicUoAndEx32, pu32, NO_BARRIER,
6525 "and %w[uNew], %w[uOld], %w[uVal]\n\t",
6526 "and %[uNew], %[uOld], %[uVal]\n\t",
6527 [uVal] "r" (u32));
6528# endif
6529 return u32OldRet;
6530# endif
6531
6532#else
6533 return ASMAtomicAndExU32(pu32, u32); /* (we have no unordered cmpxchg primitive atm.) */
6534#endif
6535}
6536
6537
6538/**
6539 * Atomically And a signed 32-bit value, unordered.
6540 *
6541 * @param pi32 Pointer to the pointer variable to AND i32 with.
6542 * @param i32 The value to AND *pi32 with.
6543 *
6544 * @remarks x86: Requires a 386 or later.
6545 */
6546DECLINLINE(void) ASMAtomicUoAndS32(int32_t volatile RT_FAR *pi32, int32_t i32) RT_NOTHROW_DEF
6547{
6548 ASMAtomicUoAndU32((uint32_t volatile RT_FAR *)pi32, (uint32_t)i32);
6549}
6550
6551
6552/**
6553 * Atomically And an unsigned 64-bit value, unordered.
6554 *
6555 * @param pu64 Pointer to the pointer variable to AND u64 with.
6556 * @param u64 The value to AND *pu64 with.
6557 *
6558 * @remarks x86: Requires a Pentium or later.
6559 */
6560#if RT_INLINE_ASM_EXTERNAL_TMP_ARM
6561DECLASM(void) ASMAtomicUoAndU64(uint64_t volatile RT_FAR *pu64, uint64_t u64) RT_NOTHROW_PROTO;
6562#else
6563DECLINLINE(void) ASMAtomicUoAndU64(uint64_t volatile RT_FAR *pu64, uint64_t u64) RT_NOTHROW_DEF
6564{
6565# if RT_INLINE_ASM_GNU_STYLE && defined(RT_ARCH_AMD64)
6566 __asm__ __volatile__("andq %1, %0\n\t"
6567 : "=m" (*pu64)
6568 : "r" (u64)
6569 , "m" (*pu64)
6570 : "cc");
6571
6572# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
6573# if RT_INLINE_ASM_USES_INTRIN
6574 _InterlockedAnd64_nf((volatile int64_t *)pu64, (int64_t)u64); /* similar code to the non-lse case below */
6575
6576# elif defined(RTASM_ARM64_USE_FEAT_LSE)
6577 __asm__ __volatile__("Lstart_ASMAtomicUoAndU64_%=:\n\t"
6578 "stclr %[fBitsToClear], %[pMem]\n\t"
6579 : [pMem] "+Q" (*pu64)
6580 : [fBitsToClear] "r" (~u64)
6581 : );
6582# else
6583 RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_64(ASMAtomicUoAndU64, pu64, NO_BARRIER,
6584 "and %[uNew], %[uNew], %[uVal]\n\t"
6585 ,
6586 "and %[uNew], %[uNew], %[uVal]\n\t"
6587 "and %H[uNew], %H[uNew], %H[uVal]\n\t",
6588 [uVal] "r" (u64));
6589# endif
6590
6591# else
6592 for (;;)
6593 {
6594 uint64_t u64Old = ASMAtomicUoReadU64(pu64);
6595 uint64_t u64New = u64Old & u64;
6596 if (ASMAtomicCmpXchgU64(pu64, u64New, u64Old))
6597 break;
6598 ASMNopPause();
6599 }
6600# endif
6601}
6602#endif
6603
6604
6605/**
6606 * Atomically And a signed 64-bit value, unordered.
6607 *
6608 * @param pi64 Pointer to the pointer variable to AND i64 with.
6609 * @param i64 The value to AND *pi64 with.
6610 *
6611 * @remarks x86: Requires a Pentium or later.
6612 */
6613DECLINLINE(void) ASMAtomicUoAndS64(int64_t volatile RT_FAR *pi64, int64_t i64) RT_NOTHROW_DEF
6614{
6615 ASMAtomicUoAndU64((uint64_t volatile RT_FAR *)pi64, (uint64_t)i64);
6616}
6617
6618
6619/**
6620 * Atomically XOR an unsigned 32-bit value, unordered but interrupt safe.
6621 *
6622 * @param pu32 Pointer to the variable to XOR @a u32 with.
6623 * @param u32 The value to OR @a *pu32 with.
6624 *
6625 * @remarks x86: Requires a 386 or later.
6626 */
6627#if RT_INLINE_ASM_EXTERNAL_TMP_ARM
6628RT_ASM_DECL_PRAGMA_WATCOM(void) ASMAtomicUoXorU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_PROTO;
6629#else
6630DECLINLINE(void) ASMAtomicUoXorU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_DEF
6631{
6632# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
6633# if RT_INLINE_ASM_GNU_STYLE
6634 __asm__ __volatile__("xorl %1, %0\n\t"
6635 : "=m" (*pu32)
6636 : "ir" (u32)
6637 , "m" (*pu32)
6638 : "cc");
6639# else
6640 __asm
6641 {
6642 mov eax, [u32]
6643# ifdef RT_ARCH_AMD64
6644 mov rdx, [pu32]
6645 xor [rdx], eax
6646# else
6647 mov edx, [pu32]
6648 xor [edx], eax
6649# endif
6650 }
6651# endif
6652
6653# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
6654# if RT_INLINE_ASM_USES_INTRIN
6655 _InterlockedXor_nf((volatile long *)pu32, (long)u32); /* similar code to the non-lse case below */
6656# elif defined(RTASM_ARM64_USE_FEAT_LSE)
6657 __asm__ __volatile__("Lstart_ASMAtomicUoXorU32_%=:\n\t"
6658 "steor %w[fBitMask], %[pMem]\n\t"
6659 : [pMem] "+Q" (*pu32)
6660 : [fBitMask] "r" (u32)
6661 : );
6662# else
6663 RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_32(ASMAtomicUoXorU32, pu32, NO_BARRIER,
6664 "eor %w[uNew], %w[uNew], %w[uVal]\n\t",
6665 "eor %[uNew], %[uNew], %[uVal]\n\t",
6666 [uVal] "r" (u32));
6667# endif
6668
6669# else
6670# error "Port me"
6671# endif
6672}
6673#endif
6674
6675
6676/**
6677 * Atomically XOR an unsigned 32-bit value, unordered but interrupt safe,
6678 * extended version (for bitmap fallback).
6679 *
6680 * @returns Old value.
6681 * @param pu32 Pointer to the variable to XOR @a u32 with.
6682 * @param u32 The value to OR @a *pu32 with.
6683 */
6684DECLINLINE(uint32_t) ASMAtomicUoXorExU32(uint32_t volatile RT_FAR *pu32, uint32_t u32) RT_NOTHROW_DEF
6685{
6686#if defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
6687# if RT_INLINE_ASM_USES_INTRIN
6688 return (uint32_t)_InterlockedXor_nf((volatile long *)pu32, (long)u32); /* similar code to the non-lse case below */
6689# else
6690# if defined(RTASM_ARM64_USE_FEAT_LSE)
6691 uint32_t u32OldRet;
6692 __asm__ __volatile__("Lstart_ASMAtomicUoXorExU32_%=:\n\t"
6693 "ldeor %w[fBitMask], %w[uOldRet], %[pMem]\n\t"
6694 : [pMem] "+Q" (*pu32)
6695 , [uOldRet] "=&r" (u32OldRet)
6696 : [fBitMask] "r" (u32)
6697 : );
6698# else
6699 RTASM_ARM_LOAD_MODIFY_STORE_RET_OLD_32(ASMAtomicUoXorExU32, pu32, NO_BARRIER,
6700 "eor %w[uNew], %w[uOld], %w[uVal]\n\t",
6701 "eor %[uNew], %[uOld], %[uVal]\n\t",
6702 [uVal] "r" (u32));
6703# endif
6704 return u32OldRet;
6705# endif
6706
6707#else
6708 return ASMAtomicXorExU32(pu32, u32); /* (we have no unordered cmpxchg primitive atm.) */
6709#endif
6710}
6711
6712
6713/**
6714 * Atomically XOR a signed 32-bit value, unordered.
6715 *
6716 * @param pi32 Pointer to the variable to XOR @a u32 with.
6717 * @param i32 The value to XOR @a *pu32 with.
6718 *
6719 * @remarks x86: Requires a 386 or later.
6720 */
6721DECLINLINE(void) ASMAtomicUoXorS32(int32_t volatile RT_FAR *pi32, int32_t i32) RT_NOTHROW_DEF
6722{
6723 ASMAtomicUoXorU32((uint32_t volatile RT_FAR *)pi32, (uint32_t)i32);
6724}
6725
6726
6727/**
6728 * Atomically increment an unsigned 32-bit value, unordered.
6729 *
6730 * @returns the new value.
6731 * @param pu32 Pointer to the variable to increment.
6732 *
6733 * @remarks x86: Requires a 486 or later.
6734 */
6735#if RT_INLINE_ASM_EXTERNAL_TMP_ARM
6736RT_ASM_DECL_PRAGMA_WATCOM(uint32_t) ASMAtomicUoIncU32(uint32_t volatile RT_FAR *pu32) RT_NOTHROW_PROTO;
6737#else
6738DECLINLINE(uint32_t) ASMAtomicUoIncU32(uint32_t volatile RT_FAR *pu32) RT_NOTHROW_DEF
6739{
6740# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
6741 uint32_t u32;
6742# if RT_INLINE_ASM_GNU_STYLE
6743 __asm__ __volatile__("xaddl %0, %1\n\t"
6744 : "=r" (u32)
6745 , "=m" (*pu32)
6746 : "0" (1)
6747 , "m" (*pu32)
6748 : "memory" /** @todo why 'memory'? */
6749 , "cc");
6750 return u32 + 1;
6751# else
6752 __asm
6753 {
6754 mov eax, 1
6755# ifdef RT_ARCH_AMD64
6756 mov rdx, [pu32]
6757 xadd [rdx], eax
6758# else
6759 mov edx, [pu32]
6760 xadd [edx], eax
6761# endif
6762 mov u32, eax
6763 }
6764 return u32 + 1;
6765# endif
6766
6767# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
6768# if RT_INLINE_ASM_USES_INTRIN
6769 return _InterlockedIncrement_nf((volatile long *)pu32); /* generates code similar to the non-lse case below */
6770# else
6771 /* M1 benchmark: ldadd=2031 vs non-lse=6301 (ps/call) */
6772# if defined(RTASM_ARM64_USE_FEAT_LSE)
6773 uint32_t u32NewRet;
6774 __asm__ __volatile__("Lstart_ASMAtomicUoIncU32_%=:\n\t"
6775 "ldadd %w[uAddend], %w[uNewRet], %[pMem]\n\t"
6776 "add %w[uNewRet], %w[uNewRet], #1\n\t"
6777 : [pMem] "+Q" (*pu32)
6778 , [uNewRet] "=&r" (u32NewRet)
6779 : [uAddend] "r" ((uint32_t)1)
6780 : );
6781# else
6782 RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_32(ASMAtomicUoIncU32, pu32, NO_BARRIER,
6783 "add %w[uNew], %w[uNew], #1\n\t",
6784 "add %[uNew], %[uNew], #1\n\t" /* arm6 / thumb2+ */,
6785 "X" (0) /* dummy */);
6786# endif
6787 return u32NewRet;
6788# endif
6789
6790# else
6791# error "Port me"
6792# endif
6793}
6794#endif
6795
6796
6797/**
6798 * Atomically decrement an unsigned 32-bit value, unordered.
6799 *
6800 * @returns the new value.
6801 * @param pu32 Pointer to the variable to decrement.
6802 *
6803 * @remarks x86: Requires a 486 or later.
6804 */
6805#if RT_INLINE_ASM_EXTERNAL_TMP_ARM
6806RT_ASM_DECL_PRAGMA_WATCOM(uint32_t) ASMAtomicUoDecU32(uint32_t volatile RT_FAR *pu32) RT_NOTHROW_PROTO;
6807#else
6808DECLINLINE(uint32_t) ASMAtomicUoDecU32(uint32_t volatile RT_FAR *pu32) RT_NOTHROW_DEF
6809{
6810# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
6811 uint32_t u32;
6812# if RT_INLINE_ASM_GNU_STYLE
6813 __asm__ __volatile__("lock; xaddl %0, %1\n\t"
6814 : "=r" (u32)
6815 , "=m" (*pu32)
6816 : "0" (-1)
6817 , "m" (*pu32)
6818 : "memory"
6819 , "cc");
6820 return u32 - 1;
6821# else
6822 __asm
6823 {
6824 mov eax, -1
6825# ifdef RT_ARCH_AMD64
6826 mov rdx, [pu32]
6827 xadd [rdx], eax
6828# else
6829 mov edx, [pu32]
6830 xadd [edx], eax
6831# endif
6832 mov u32, eax
6833 }
6834 return u32 - 1;
6835# endif
6836
6837# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
6838# if RT_INLINE_ASM_USES_INTRIN
6839 return _InterlockedDecrement_nf((volatile long *)pu32); /* generates code similar to the non-lse case below */
6840# else
6841 /* M1 benchmark: ldadd=2101 vs non-lse=6268 (ps/call) */
6842# if defined(RTASM_ARM64_USE_FEAT_LSE)
6843 uint32_t u32NewRet;
6844 __asm__ __volatile__("Lstart_ASMAtomicUoDecU32_%=:\n\t"
6845 "ldadd %w[uAddend], %w[uNewRet], %[pMem]\n\t"
6846 "sub %w[uNewRet], %w[uNewRet], #1\n\t"
6847 : [pMem] "+Q" (*pu32)
6848 , [uNewRet] "=&r" (u32NewRet)
6849 : [uAddend] "r" (~(uint32_t)0)
6850 : );
6851# else
6852 RTASM_ARM_LOAD_MODIFY_STORE_RET_NEW_32(ASMAtomicUoDecU32, pu32, NO_BARRIER,
6853 "sub %w[uNew], %w[uNew], #1\n\t",
6854 "sub %[uNew], %[uNew], #1\n\t" /* arm6 / thumb2+ */,
6855 "X" (0) /* dummy */);
6856# endif
6857 return u32NewRet;
6858# endif
6859
6860# else
6861# error "Port me"
6862# endif
6863}
6864#endif
6865
/** @todo Move ASMByteSwapU16, ASMByteSwapU32 and ASMByteSwapU64 into their own
 * header as it's a common reason for including asm.h. */
6868
6869
6870/**
6871 * Reverse the byte order of the given 16-bit integer.
6872 *
6873 * @returns Revert
6874 * @param u16 16-bit integer value.
6875 */
6876#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
6877RT_ASM_DECL_PRAGMA_WATCOM(uint16_t) ASMByteSwapU16(uint16_t u16) RT_NOTHROW_PROTO;
6878#else
6879DECLINLINE(uint16_t) ASMByteSwapU16(uint16_t u16) RT_NOTHROW_DEF
6880{
6881# if RT_INLINE_ASM_USES_INTRIN
6882 return _byteswap_ushort(u16);
6883
6884# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
6885# if RT_INLINE_ASM_GNU_STYLE
6886 __asm__ ("rorw $8, %0" : "=r" (u16) : "0" (u16) : "cc");
6887# else
6888 _asm
6889 {
6890 mov ax, [u16]
6891 ror ax, 8
6892 mov [u16], ax
6893 }
6894# endif
6895 return u16;
6896
6897# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
6898 uint32_t u32Ret;
6899 __asm__ __volatile__(
6900# if defined(RT_ARCH_ARM64)
6901 "rev16 %w[uRet], %w[uVal]\n\t"
6902# else
6903 "rev16 %[uRet], %[uVal]\n\t"
6904# endif
6905 : [uRet] "=r" (u32Ret)
6906 : [uVal] "r" (u16));
6907 return (uint16_t)u32Ret;
6908
6909# else
6910# error "Port me"
6911# endif
6912}
6913#endif
6914
6915
6916/**
6917 * Reverse the byte order of the given 32-bit integer.
6918 *
6919 * @returns Revert
6920 * @param u32 32-bit integer value.
6921 */
6922#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
6923RT_ASM_DECL_PRAGMA_WATCOM(uint32_t) ASMByteSwapU32(uint32_t u32) RT_NOTHROW_PROTO;
6924#else
6925DECLINLINE(uint32_t) ASMByteSwapU32(uint32_t u32) RT_NOTHROW_DEF
6926{
6927# if RT_INLINE_ASM_USES_INTRIN
6928 return _byteswap_ulong(u32);
6929
6930# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
6931# if RT_INLINE_ASM_GNU_STYLE
6932 __asm__ ("bswapl %0" : "=r" (u32) : "0" (u32));
6933# else
6934 _asm
6935 {
6936 mov eax, [u32]
6937 bswap eax
6938 mov [u32], eax
6939 }
6940# endif
6941 return u32;
6942
6943# elif defined(RT_ARCH_ARM64)
6944 uint64_t u64Ret;
6945 __asm__ __volatile__("rev32 %[uRet], %[uVal]\n\t"
6946 : [uRet] "=r" (u64Ret)
6947 : [uVal] "r" ((uint64_t)u32));
6948 return (uint32_t)u64Ret;
6949
6950# elif defined(RT_ARCH_ARM32)
6951 __asm__ __volatile__("rev %[uRet], %[uVal]\n\t"
6952 : [uRet] "=r" (u32)
6953 : [uVal] "[uRet]" (u32));
6954 return u32;
6955
6956# else
6957# error "Port me"
6958# endif
6959}
6960#endif
6961
6962
6963/**
6964 * Reverse the byte order of the given 64-bit integer.
6965 *
6966 * @returns Revert
6967 * @param u64 64-bit integer value.
6968 */
6969DECLINLINE(uint64_t) ASMByteSwapU64(uint64_t u64) RT_NOTHROW_DEF
6970{
6971#if RT_INLINE_ASM_USES_INTRIN && (defined(RT_ARCH_AMD64) || defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32))
6972 return _byteswap_uint64(u64);
6973
6974# elif RT_INLINE_ASM_GNU_STYLE && defined(RT_ARCH_AMD64)
6975 __asm__ ("bswapq %0" : "=r" (u64) : "0" (u64));
6976 return u64;
6977
6978# elif defined(RT_ARCH_ARM64)
6979 __asm__ __volatile__("rev %[uRet], %[uVal]\n\t"
6980 : [uRet] "=r" (u64)
6981 : [uVal] "[uRet]" (u64));
6982 return u64;
6983
6984#else
6985 return (uint64_t)ASMByteSwapU32((uint32_t)u64) << 32
6986 | (uint64_t)ASMByteSwapU32((uint32_t)(u64 >> 32));
6987#endif
6988}
6989
6990
6991
6992/** @defgroup grp_inline_bits Bitmap Operations
6993 *
6994 * @todo Move these into a separate header, with standard IPRT prefix
6995 * (RTBitmapXxx). Move the more complex (searched) stuff into C source
6996 * files.
6997 *
6998 * @{
6999 */
7000
7001
7002/**
7003 * Sets a bit in a bitmap.
7004 *
7005 * @param pvBitmap Pointer to the bitmap (little endian). This should be
7006 * 32-bit aligned.
7007 * @param iBit The bit to set.
7008 *
7009 * @remarks The 32-bit aligning of pvBitmap is not a strict requirement.
7010 * However, doing so will yield better performance as well as avoiding
7011 * traps accessing the last bits in the bitmap.
7012 */
7013#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
7014RT_ASM_DECL_PRAGMA_WATCOM(void) ASMBitSet(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_PROTO;
7015#else
7016DECLINLINE(void) ASMBitSet(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_DEF
7017{
7018# if RT_INLINE_ASM_USES_INTRIN
7019 _bittestandset((long RT_FAR *)pvBitmap, iBit);
7020
7021# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
7022# if RT_INLINE_ASM_GNU_STYLE
7023 __asm__ __volatile__("btsl %1, %0"
7024 : "=m" (*(volatile long RT_FAR *)pvBitmap)
7025 : "Ir" (iBit)
7026 , "m" (*(volatile long RT_FAR *)pvBitmap)
7027 : "memory"
7028 , "cc");
7029# else
7030 __asm
7031 {
7032# ifdef RT_ARCH_AMD64
7033 mov rax, [pvBitmap]
7034 mov edx, [iBit]
7035 bts [rax], edx
7036# else
7037 mov eax, [pvBitmap]
7038 mov edx, [iBit]
7039 bts [eax], edx
7040# endif
7041 }
7042# endif
7043
7044# else
7045 int32_t offBitmap = iBit / 32;
7046 AssertStmt(!((uintptr_t)pvBitmap & 3), offBitmap += (uintptr_t)pvBitmap & 3; iBit += ((uintptr_t)pvBitmap & 3) * 8);
7047 ASMAtomicUoOrU32(&((uint32_t volatile *)pvBitmap)[offBitmap], RT_H2LE_U32(RT_BIT_32(iBit & 31)));
7048# endif
7049}
7050#endif
7051
7052
7053/**
7054 * Atomically sets a bit in a bitmap, ordered.
7055 *
7056 * @param pvBitmap Pointer to the bitmap (little endian). Must be 32-bit
7057 * aligned, otherwise the memory access isn't atomic!
7058 * @param iBit The bit to set.
7059 *
7060 * @remarks x86: Requires a 386 or later.
7061 */
7062#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
7063RT_ASM_DECL_PRAGMA_WATCOM(void) ASMAtomicBitSet(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_PROTO;
7064#else
7065DECLINLINE(void) ASMAtomicBitSet(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_DEF
7066{
7067 AssertMsg(!((uintptr_t)pvBitmap & 3), ("address %p not 32-bit aligned", pvBitmap));
7068# if RT_INLINE_ASM_USES_INTRIN
7069 _interlockedbittestandset((long RT_FAR *)pvBitmap, iBit);
7070
7071# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
7072# if RT_INLINE_ASM_GNU_STYLE
7073 __asm__ __volatile__("lock; btsl %1, %0"
7074 : "=m" (*(volatile long *)pvBitmap)
7075 : "Ir" (iBit)
7076 , "m" (*(volatile long *)pvBitmap)
7077 : "memory"
7078 , "cc");
7079# else
7080 __asm
7081 {
7082# ifdef RT_ARCH_AMD64
7083 mov rax, [pvBitmap]
7084 mov edx, [iBit]
7085 lock bts [rax], edx
7086# else
7087 mov eax, [pvBitmap]
7088 mov edx, [iBit]
7089 lock bts [eax], edx
7090# endif
7091 }
7092# endif
7093
7094# else
7095 ASMAtomicOrU32(&((uint32_t volatile *)pvBitmap)[iBit / 32], RT_H2LE_U32(RT_BIT_32(iBit & 31)));
7096# endif
7097}
7098#endif
7099
7100
7101/**
7102 * Clears a bit in a bitmap.
7103 *
7104 * @param pvBitmap Pointer to the bitmap (little endian).
7105 * @param iBit The bit to clear.
7106 *
7107 * @remarks The 32-bit aligning of pvBitmap is not a strict requirement.
7108 * However, doing so will yield better performance as well as avoiding
7109 * traps accessing the last bits in the bitmap.
7110 */
7111#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
7112RT_ASM_DECL_PRAGMA_WATCOM(void) ASMBitClear(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_PROTO;
7113#else
7114DECLINLINE(void) ASMBitClear(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_DEF
7115{
7116# if RT_INLINE_ASM_USES_INTRIN
7117 _bittestandreset((long RT_FAR *)pvBitmap, iBit);
7118
7119# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
7120# if RT_INLINE_ASM_GNU_STYLE
7121 __asm__ __volatile__("btrl %1, %0"
7122 : "=m" (*(volatile long RT_FAR *)pvBitmap)
7123 : "Ir" (iBit)
7124 , "m" (*(volatile long RT_FAR *)pvBitmap)
7125 : "memory"
7126 , "cc");
7127# else
7128 __asm
7129 {
7130# ifdef RT_ARCH_AMD64
7131 mov rax, [pvBitmap]
7132 mov edx, [iBit]
7133 btr [rax], edx
7134# else
7135 mov eax, [pvBitmap]
7136 mov edx, [iBit]
7137 btr [eax], edx
7138# endif
7139 }
7140# endif
7141
7142# else
7143 int32_t offBitmap = iBit / 32;
7144 AssertStmt(!((uintptr_t)pvBitmap & 3), offBitmap += (uintptr_t)pvBitmap & 3; iBit += ((uintptr_t)pvBitmap & 3) * 8);
7145 ASMAtomicUoAndU32(&((uint32_t volatile *)pvBitmap)[offBitmap], RT_H2LE_U32(~RT_BIT_32(iBit & 31)));
7146# endif
7147}
7148#endif
7149
7150
7151/**
7152 * Atomically clears a bit in a bitmap, ordered.
7153 *
7154 * @param pvBitmap Pointer to the bitmap (little endian). Must be 32-bit
7155 * aligned, otherwise the memory access isn't atomic!
7156 * @param iBit The bit to toggle set.
7157 *
7158 * @remarks No memory barrier, take care on smp.
7159 * @remarks x86: Requires a 386 or later.
7160 */
7161#if RT_INLINE_ASM_EXTERNAL_TMP_ARM
7162RT_ASM_DECL_PRAGMA_WATCOM(void) ASMAtomicBitClear(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_PROTO;
7163#else
7164DECLINLINE(void) ASMAtomicBitClear(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_DEF
7165{
7166 AssertMsg(!((uintptr_t)pvBitmap & 3), ("address %p not 32-bit aligned", pvBitmap));
7167# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
7168# if RT_INLINE_ASM_GNU_STYLE
7169 __asm__ __volatile__("lock; btrl %1, %0"
7170 : "=m" (*(volatile long RT_FAR *)pvBitmap)
7171 : "Ir" (iBit)
7172 , "m" (*(volatile long RT_FAR *)pvBitmap)
7173 : "memory"
7174 , "cc");
7175# else
7176 __asm
7177 {
7178# ifdef RT_ARCH_AMD64
7179 mov rax, [pvBitmap]
7180 mov edx, [iBit]
7181 lock btr [rax], edx
7182# else
7183 mov eax, [pvBitmap]
7184 mov edx, [iBit]
7185 lock btr [eax], edx
7186# endif
7187 }
7188# endif
7189# else
7190 ASMAtomicAndU32(&((uint32_t volatile *)pvBitmap)[iBit / 32], RT_H2LE_U32(~RT_BIT_32(iBit & 31)));
7191# endif
7192}
7193#endif
7194
7195
7196/**
7197 * Toggles a bit in a bitmap.
7198 *
7199 * @param pvBitmap Pointer to the bitmap (little endian).
7200 * @param iBit The bit to toggle.
7201 *
7202 * @remarks The 32-bit aligning of pvBitmap is not a strict requirement.
7203 * However, doing so will yield better performance as well as avoiding
7204 * traps accessing the last bits in the bitmap.
7205 */
7206#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
7207RT_ASM_DECL_PRAGMA_WATCOM(void) ASMBitToggle(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_PROTO;
7208#else
7209DECLINLINE(void) ASMBitToggle(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_DEF
7210{
7211# if RT_INLINE_ASM_USES_INTRIN
7212 _bittestandcomplement((long RT_FAR *)pvBitmap, iBit);
7213# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
7214# if RT_INLINE_ASM_GNU_STYLE
7215 __asm__ __volatile__("btcl %1, %0"
7216 : "=m" (*(volatile long *)pvBitmap)
7217 : "Ir" (iBit)
7218 , "m" (*(volatile long *)pvBitmap)
7219 : "memory"
7220 , "cc");
7221# else
7222 __asm
7223 {
7224# ifdef RT_ARCH_AMD64
7225 mov rax, [pvBitmap]
7226 mov edx, [iBit]
7227 btc [rax], edx
7228# else
7229 mov eax, [pvBitmap]
7230 mov edx, [iBit]
7231 btc [eax], edx
7232# endif
7233 }
7234# endif
7235# else
7236 int32_t offBitmap = iBit / 32;
7237 AssertStmt(!((uintptr_t)pvBitmap & 3), offBitmap += (uintptr_t)pvBitmap & 3; iBit += ((uintptr_t)pvBitmap & 3) * 8);
7238 ASMAtomicUoXorU32(&((uint32_t volatile *)pvBitmap)[offBitmap], RT_H2LE_U32(RT_BIT_32(iBit & 31)));
7239# endif
7240}
7241#endif
7242
7243
7244/**
7245 * Atomically toggles a bit in a bitmap, ordered.
7246 *
7247 * @param pvBitmap Pointer to the bitmap (little endian). Must be 32-bit
7248 * aligned, otherwise the memory access isn't atomic!
7249 * @param iBit The bit to test and set.
7250 *
7251 * @remarks x86: Requires a 386 or later.
7252 */
7253#if RT_INLINE_ASM_EXTERNAL_TMP_ARM
7254RT_ASM_DECL_PRAGMA_WATCOM(void) ASMAtomicBitToggle(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_PROTO;
7255#else
7256DECLINLINE(void) ASMAtomicBitToggle(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_DEF
7257{
7258 AssertMsg(!((uintptr_t)pvBitmap & 3), ("address %p not 32-bit aligned", pvBitmap));
7259# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
7260# if RT_INLINE_ASM_GNU_STYLE
7261 __asm__ __volatile__("lock; btcl %1, %0"
7262 : "=m" (*(volatile long RT_FAR *)pvBitmap)
7263 : "Ir" (iBit)
7264 , "m" (*(volatile long RT_FAR *)pvBitmap)
7265 : "memory"
7266 , "cc");
7267# else
7268 __asm
7269 {
7270# ifdef RT_ARCH_AMD64
7271 mov rax, [pvBitmap]
7272 mov edx, [iBit]
7273 lock btc [rax], edx
7274# else
7275 mov eax, [pvBitmap]
7276 mov edx, [iBit]
7277 lock btc [eax], edx
7278# endif
7279 }
7280# endif
7281# else
7282 ASMAtomicXorU32(&((uint32_t volatile *)pvBitmap)[iBit / 32], RT_H2LE_U32(RT_BIT_32(iBit & 31)));
7283# endif
7284}
7285#endif
7286
7287
7288/**
7289 * Tests and sets a bit in a bitmap.
7290 *
7291 * @returns true if the bit was set.
7292 * @returns false if the bit was clear.
7293 *
7294 * @param pvBitmap Pointer to the bitmap (little endian).
7295 * @param iBit The bit to test and set.
7296 *
7297 * @remarks The 32-bit aligning of pvBitmap is not a strict requirement.
7298 * However, doing so will yield better performance as well as avoiding
7299 * traps accessing the last bits in the bitmap.
7300 */
7301#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
7302RT_ASM_DECL_PRAGMA_WATCOM(bool) ASMBitTestAndSet(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_PROTO;
7303#else
7304DECLINLINE(bool) ASMBitTestAndSet(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_DEF
7305{
7306 union { bool f; uint32_t u32; uint8_t u8; } rc;
7307# if RT_INLINE_ASM_USES_INTRIN
7308 rc.u8 = _bittestandset((long RT_FAR *)pvBitmap, iBit);
7309
7310# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
7311# if RT_INLINE_ASM_GNU_STYLE
7312 __asm__ __volatile__("btsl %2, %1\n\t"
7313 "setc %b0\n\t"
7314 "andl $1, %0\n\t"
7315 : "=q" (rc.u32)
7316 , "=m" (*(volatile long RT_FAR *)pvBitmap)
7317 : "Ir" (iBit)
7318 , "m" (*(volatile long RT_FAR *)pvBitmap)
7319 : "memory"
7320 , "cc");
7321# else
7322 __asm
7323 {
7324 mov edx, [iBit]
7325# ifdef RT_ARCH_AMD64
7326 mov rax, [pvBitmap]
7327 bts [rax], edx
7328# else
7329 mov eax, [pvBitmap]
7330 bts [eax], edx
7331# endif
7332 setc al
7333 and eax, 1
7334 mov [rc.u32], eax
7335 }
7336# endif
7337
7338# else
7339 int32_t offBitmap = iBit / 32;
7340 AssertStmt(!((uintptr_t)pvBitmap & 3), offBitmap += (uintptr_t)pvBitmap & 3; iBit += ((uintptr_t)pvBitmap & 3) * 8);
7341 rc.u32 = RT_LE2H_U32(ASMAtomicUoOrExU32(&((uint32_t volatile *)pvBitmap)[offBitmap], RT_H2LE_U32(RT_BIT_32(iBit & 31))))
7342 >> (iBit & 31);
7343 rc.u32 &= 1;
7344# endif
7345 return rc.f;
7346}
7347#endif
7348
7349
7350/**
7351 * Atomically tests and sets a bit in a bitmap, ordered.
7352 *
7353 * @returns true if the bit was set.
7354 * @returns false if the bit was clear.
7355 *
7356 * @param pvBitmap Pointer to the bitmap (little endian). Must be 32-bit
7357 * aligned, otherwise the memory access isn't atomic!
7358 * @param iBit The bit to set.
7359 *
7360 * @remarks x86: Requires a 386 or later.
7361 */
7362#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
7363RT_ASM_DECL_PRAGMA_WATCOM(bool) ASMAtomicBitTestAndSet(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_PROTO;
7364#else
7365DECLINLINE(bool) ASMAtomicBitTestAndSet(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_DEF
7366{
7367 union { bool f; uint32_t u32; uint8_t u8; } rc;
7368 AssertMsg(!((uintptr_t)pvBitmap & 3), ("address %p not 32-bit aligned", pvBitmap));
7369# if RT_INLINE_ASM_USES_INTRIN
7370 rc.u8 = _interlockedbittestandset((long RT_FAR *)pvBitmap, iBit);
7371# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
7372# if RT_INLINE_ASM_GNU_STYLE
7373 __asm__ __volatile__("lock; btsl %2, %1\n\t"
7374 "setc %b0\n\t"
7375 "andl $1, %0\n\t"
7376 : "=q" (rc.u32)
7377 , "=m" (*(volatile long RT_FAR *)pvBitmap)
7378 : "Ir" (iBit)
7379 , "m" (*(volatile long RT_FAR *)pvBitmap)
7380 : "memory"
7381 , "cc");
7382# else
7383 __asm
7384 {
7385 mov edx, [iBit]
7386# ifdef RT_ARCH_AMD64
7387 mov rax, [pvBitmap]
7388 lock bts [rax], edx
7389# else
7390 mov eax, [pvBitmap]
7391 lock bts [eax], edx
7392# endif
7393 setc al
7394 and eax, 1
7395 mov [rc.u32], eax
7396 }
7397# endif
7398
7399# else
7400 rc.u32 = RT_LE2H_U32(ASMAtomicOrExU32(&((uint32_t volatile *)pvBitmap)[iBit / 32], RT_H2LE_U32(RT_BIT_32(iBit & 31))))
7401 >> (iBit & 31);
7402 rc.u32 &= 1;
7403# endif
7404 return rc.f;
7405}
7406#endif
7407
7408
7409/**
7410 * Tests and clears a bit in a bitmap.
7411 *
7412 * @returns true if the bit was set.
7413 * @returns false if the bit was clear.
7414 *
7415 * @param pvBitmap Pointer to the bitmap (little endian).
7416 * @param iBit The bit to test and clear.
7417 *
7418 * @remarks The 32-bit aligning of pvBitmap is not a strict requirement.
7419 * However, doing so will yield better performance as well as avoiding
7420 * traps accessing the last bits in the bitmap.
7421 */
7422#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
7423RT_ASM_DECL_PRAGMA_WATCOM(bool) ASMBitTestAndClear(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_PROTO;
7424#else
7425DECLINLINE(bool) ASMBitTestAndClear(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_DEF
7426{
7427 union { bool f; uint32_t u32; uint8_t u8; } rc;
7428# if RT_INLINE_ASM_USES_INTRIN
7429 rc.u8 = _bittestandreset((long RT_FAR *)pvBitmap, iBit);
7430
7431# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
7432# if RT_INLINE_ASM_GNU_STYLE
7433 __asm__ __volatile__("btrl %2, %1\n\t"
7434 "setc %b0\n\t"
7435 "andl $1, %0\n\t"
7436 : "=q" (rc.u32)
7437 , "=m" (*(volatile long RT_FAR *)pvBitmap)
7438 : "Ir" (iBit)
7439 , "m" (*(volatile long RT_FAR *)pvBitmap)
7440 : "memory"
7441 , "cc");
7442# else
7443 __asm
7444 {
7445 mov edx, [iBit]
7446# ifdef RT_ARCH_AMD64
7447 mov rax, [pvBitmap]
7448 btr [rax], edx
7449# else
7450 mov eax, [pvBitmap]
7451 btr [eax], edx
7452# endif
7453 setc al
7454 and eax, 1
7455 mov [rc.u32], eax
7456 }
7457# endif
7458
7459# else
7460 int32_t offBitmap = iBit / 32;
7461 AssertStmt(!((uintptr_t)pvBitmap & 3), offBitmap += (uintptr_t)pvBitmap & 3; iBit += ((uintptr_t)pvBitmap & 3) * 8);
7462 rc.u32 = RT_LE2H_U32(ASMAtomicUoAndExU32(&((uint32_t volatile *)pvBitmap)[offBitmap], RT_H2LE_U32(~RT_BIT_32(iBit & 31))))
7463 >> (iBit & 31);
7464 rc.u32 &= 1;
7465# endif
7466 return rc.f;
7467}
7468#endif
7469
7470
7471/**
7472 * Atomically tests and clears a bit in a bitmap, ordered.
7473 *
7474 * @returns true if the bit was set.
7475 * @returns false if the bit was clear.
7476 *
7477 * @param pvBitmap Pointer to the bitmap (little endian). Must be 32-bit
7478 * aligned, otherwise the memory access isn't atomic!
7479 * @param iBit The bit to test and clear.
7480 *
7481 * @remarks No memory barrier, take care on smp.
7482 * @remarks x86: Requires a 386 or later.
7483 */
7484#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
7485RT_ASM_DECL_PRAGMA_WATCOM(bool) ASMAtomicBitTestAndClear(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_PROTO;
7486#else
7487DECLINLINE(bool) ASMAtomicBitTestAndClear(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_DEF
7488{
7489 union { bool f; uint32_t u32; uint8_t u8; } rc;
7490 AssertMsg(!((uintptr_t)pvBitmap & 3), ("address %p not 32-bit aligned", pvBitmap));
7491# if RT_INLINE_ASM_USES_INTRIN
7492 rc.u8 = _interlockedbittestandreset((long RT_FAR *)pvBitmap, iBit);
7493
7494# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
7495# if RT_INLINE_ASM_GNU_STYLE
7496 __asm__ __volatile__("lock; btrl %2, %1\n\t"
7497 "setc %b0\n\t"
7498 "andl $1, %0\n\t"
7499 : "=q" (rc.u32)
7500 , "=m" (*(volatile long RT_FAR *)pvBitmap)
7501 : "Ir" (iBit)
7502 , "m" (*(volatile long RT_FAR *)pvBitmap)
7503 : "memory"
7504 , "cc");
7505# else
7506 __asm
7507 {
7508 mov edx, [iBit]
7509# ifdef RT_ARCH_AMD64
7510 mov rax, [pvBitmap]
7511 lock btr [rax], edx
7512# else
7513 mov eax, [pvBitmap]
7514 lock btr [eax], edx
7515# endif
7516 setc al
7517 and eax, 1
7518 mov [rc.u32], eax
7519 }
7520# endif
7521
7522# else
7523 rc.u32 = RT_LE2H_U32(ASMAtomicAndExU32(&((uint32_t volatile *)pvBitmap)[iBit / 32], RT_H2LE_U32(~RT_BIT_32(iBit & 31))))
7524 >> (iBit & 31);
7525 rc.u32 &= 1;
7526# endif
7527 return rc.f;
7528}
7529#endif
7530
7531
7532/**
7533 * Tests and toggles a bit in a bitmap.
7534 *
7535 * @returns true if the bit was set.
7536 * @returns false if the bit was clear.
7537 *
7538 * @param pvBitmap Pointer to the bitmap (little endian).
7539 * @param iBit The bit to test and toggle.
7540 *
7541 * @remarks The 32-bit aligning of pvBitmap is not a strict requirement.
7542 * However, doing so will yield better performance as well as avoiding
7543 * traps accessing the last bits in the bitmap.
7544 */
7545#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
7546RT_ASM_DECL_PRAGMA_WATCOM(bool) ASMBitTestAndToggle(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_PROTO;
7547#else
7548DECLINLINE(bool) ASMBitTestAndToggle(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_DEF
7549{
7550 union { bool f; uint32_t u32; uint8_t u8; } rc;
7551# if RT_INLINE_ASM_USES_INTRIN
7552 rc.u8 = _bittestandcomplement((long RT_FAR *)pvBitmap, iBit);
7553
7554# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
7555# if RT_INLINE_ASM_GNU_STYLE
7556 __asm__ __volatile__("btcl %2, %1\n\t"
7557 "setc %b0\n\t"
7558 "andl $1, %0\n\t"
7559 : "=q" (rc.u32)
7560 , "=m" (*(volatile long RT_FAR *)pvBitmap)
7561 : "Ir" (iBit)
7562 , "m" (*(volatile long RT_FAR *)pvBitmap)
7563 : "memory"
7564 , "cc");
7565# else
7566 __asm
7567 {
7568 mov edx, [iBit]
7569# ifdef RT_ARCH_AMD64
7570 mov rax, [pvBitmap]
7571 btc [rax], edx
7572# else
7573 mov eax, [pvBitmap]
7574 btc [eax], edx
7575# endif
7576 setc al
7577 and eax, 1
7578 mov [rc.u32], eax
7579 }
7580# endif
7581
7582# else
7583 int32_t offBitmap = iBit / 32;
7584 AssertStmt(!((uintptr_t)pvBitmap & 3), offBitmap += (uintptr_t)pvBitmap & 3; iBit += ((uintptr_t)pvBitmap & 3) * 8);
7585 rc.u32 = RT_LE2H_U32(ASMAtomicUoXorExU32(&((uint32_t volatile *)pvBitmap)[offBitmap], RT_H2LE_U32(RT_BIT_32(iBit & 31))))
7586 >> (iBit & 31);
7587 rc.u32 &= 1;
7588# endif
7589 return rc.f;
7590}
7591#endif
7592
7593
7594/**
7595 * Atomically tests and toggles a bit in a bitmap, ordered.
7596 *
7597 * @returns true if the bit was set.
7598 * @returns false if the bit was clear.
7599 *
7600 * @param pvBitmap Pointer to the bitmap (little endian). Must be 32-bit
7601 * aligned, otherwise the memory access isn't atomic!
7602 * @param iBit The bit to test and toggle.
7603 *
7604 * @remarks x86: Requires a 386 or later.
7605 */
7606#if RT_INLINE_ASM_EXTERNAL_TMP_ARM
7607RT_ASM_DECL_PRAGMA_WATCOM(bool) ASMAtomicBitTestAndToggle(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_PROTO;
7608#else
7609DECLINLINE(bool) ASMAtomicBitTestAndToggle(volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_DEF
7610{
7611 union { bool f; uint32_t u32; uint8_t u8; } rc;
7612 AssertMsg(!((uintptr_t)pvBitmap & 3), ("address %p not 32-bit aligned", pvBitmap));
7613# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
7614# if RT_INLINE_ASM_GNU_STYLE
7615 __asm__ __volatile__("lock; btcl %2, %1\n\t"
7616 "setc %b0\n\t"
7617 "andl $1, %0\n\t"
7618 : "=q" (rc.u32)
7619 , "=m" (*(volatile long RT_FAR *)pvBitmap)
7620 : "Ir" (iBit)
7621 , "m" (*(volatile long RT_FAR *)pvBitmap)
7622 : "memory"
7623 , "cc");
7624# else
7625 __asm
7626 {
7627 mov edx, [iBit]
7628# ifdef RT_ARCH_AMD64
7629 mov rax, [pvBitmap]
7630 lock btc [rax], edx
7631# else
7632 mov eax, [pvBitmap]
7633 lock btc [eax], edx
7634# endif
7635 setc al
7636 and eax, 1
7637 mov [rc.u32], eax
7638 }
7639# endif
7640
7641# else
7642 rc.u32 = RT_H2LE_U32(ASMAtomicXorExU32(&((uint32_t volatile *)pvBitmap)[iBit / 32], RT_LE2H_U32(RT_BIT_32(iBit & 31))))
7643 >> (iBit & 31);
7644 rc.u32 &= 1;
7645# endif
7646 return rc.f;
7647}
7648#endif
7649
7650
7651/**
7652 * Tests if a bit in a bitmap is set.
7653 *
7654 * @returns true if the bit is set.
7655 * @returns false if the bit is clear.
7656 *
7657 * @param pvBitmap Pointer to the bitmap (little endian).
7658 * @param iBit The bit to test.
7659 *
7660 * @remarks The 32-bit aligning of pvBitmap is not a strict requirement.
7661 * However, doing so will yield better performance as well as avoiding
7662 * traps accessing the last bits in the bitmap.
7663 */
7664#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
7665RT_ASM_DECL_PRAGMA_WATCOM(bool) ASMBitTest(const volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_PROTO;
7666#else
7667DECLINLINE(bool) ASMBitTest(const volatile void RT_FAR *pvBitmap, int32_t iBit) RT_NOTHROW_DEF
7668{
7669 union { bool f; uint32_t u32; uint8_t u8; } rc;
7670# if RT_INLINE_ASM_USES_INTRIN
7671 rc.u32 = _bittest((long *)pvBitmap, iBit);
7672
7673# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
7674# if RT_INLINE_ASM_GNU_STYLE
7675
7676 __asm__ __volatile__("btl %2, %1\n\t"
7677 "setc %b0\n\t"
7678 "andl $1, %0\n\t"
7679 : "=q" (rc.u32)
7680 : "m" (*(const volatile long RT_FAR *)pvBitmap)
7681 , "Ir" (iBit)
7682 : "memory"
7683 , "cc");
7684# else
7685 __asm
7686 {
7687 mov edx, [iBit]
7688# ifdef RT_ARCH_AMD64
7689 mov rax, [pvBitmap]
7690 bt [rax], edx
7691# else
7692 mov eax, [pvBitmap]
7693 bt [eax], edx
7694# endif
7695 setc al
7696 and eax, 1
7697 mov [rc.u32], eax
7698 }
7699# endif
7700
7701# else
7702 int32_t offBitmap = iBit / 32;
7703 AssertRelease(!((uintptr_t)pvBitmap & (sizeof(uint32_t) - 1)));
7704 rc.u32 = RT_LE2H_U32(ASMAtomicUoReadU32(&((uint32_t volatile *)pvBitmap)[offBitmap])) >> (iBit & 31);
7705 rc.u32 &= 1;
7706# endif
7707 return rc.f;
7708}
7709#endif
7710
7711
7712#ifdef IPRT_INCLUDED_asm_mem_h
7713
7714/**
7715 * Clears a bit range within a bitmap.
7716 *
7717 * @param pvBitmap Pointer to the bitmap (little endian).
7718 * @param iBitStart The First bit to clear.
7719 * @param iBitEnd The first bit not to clear.
7720 */
7721DECLINLINE(void) ASMBitClearRange(volatile void RT_FAR *pvBitmap, size_t iBitStart, size_t iBitEnd) RT_NOTHROW_DEF
7722{
7723 if (iBitStart < iBitEnd)
7724 {
7725 uint32_t volatile RT_FAR *pu32 = (volatile uint32_t RT_FAR *)pvBitmap + (iBitStart >> 5);
7726 size_t iStart = iBitStart & ~(size_t)31;
7727 size_t iEnd = iBitEnd & ~(size_t)31;
7728 if (iStart == iEnd)
7729 *pu32 &= RT_H2LE_U32(((UINT32_C(1) << (iBitStart & 31)) - 1) | ~((UINT32_C(1) << (iBitEnd & 31)) - 1));
7730 else
7731 {
7732 /* bits in first dword. */
7733 if (iBitStart & 31)
7734 {
7735 *pu32 &= RT_H2LE_U32((UINT32_C(1) << (iBitStart & 31)) - 1);
7736 pu32++;
7737 iBitStart = iStart + 32;
7738 }
7739
7740 /* whole dwords. */
7741 if (iBitStart != iEnd)
7742 ASMMemZero32(pu32, (iEnd - iBitStart) >> 3);
7743
7744 /* bits in last dword. */
7745 if (iBitEnd & 31)
7746 {
7747 pu32 = (volatile uint32_t RT_FAR *)pvBitmap + (iBitEnd >> 5);
7748 *pu32 &= RT_H2LE_U32(~((UINT32_C(1) << (iBitEnd & 31)) - 1));
7749 }
7750 }
7751 }
7752}
7753
7754
7755/**
7756 * Sets a bit range within a bitmap.
7757 *
7758 * @param pvBitmap Pointer to the bitmap (little endian).
7759 * @param iBitStart The First bit to set.
7760 * @param iBitEnd The first bit not to set.
7761 */
7762DECLINLINE(void) ASMBitSetRange(volatile void RT_FAR *pvBitmap, size_t iBitStart, size_t iBitEnd) RT_NOTHROW_DEF
7763{
7764 if (iBitStart < iBitEnd)
7765 {
7766 uint32_t volatile RT_FAR *pu32 = (volatile uint32_t RT_FAR *)pvBitmap + (iBitStart >> 5);
7767 size_t iStart = iBitStart & ~(size_t)31;
7768 size_t iEnd = iBitEnd & ~(size_t)31;
7769 if (iStart == iEnd)
7770 *pu32 |= RT_H2LE_U32(((UINT32_C(1) << (iBitEnd - iBitStart)) - 1) << (iBitStart & 31));
7771 else
7772 {
7773 /* bits in first dword. */
7774 if (iBitStart & 31)
7775 {
7776 *pu32 |= RT_H2LE_U32(~((UINT32_C(1) << (iBitStart & 31)) - 1));
7777 pu32++;
7778 iBitStart = iStart + 32;
7779 }
7780
7781 /* whole dword. */
7782 if (iBitStart != iEnd)
7783 ASMMemFill32(pu32, (iEnd - iBitStart) >> 3, ~UINT32_C(0));
7784
7785 /* bits in last dword. */
7786 if (iBitEnd & 31)
7787 {
7788 pu32 = (volatile uint32_t RT_FAR *)pvBitmap + (iBitEnd >> 5);
7789 *pu32 |= RT_H2LE_U32((UINT32_C(1) << (iBitEnd & 31)) - 1);
7790 }
7791 }
7792 }
7793}
7794
7795#endif /* IPRT_INCLUDED_asm_mem_h */
7796
7797/**
7798 * Finds the first clear bit in a bitmap.
7799 *
7800 * @returns Index of the first zero bit.
7801 * @returns -1 if no clear bit was found.
7802 * @param pvBitmap Pointer to the bitmap (little endian).
7803 * @param cBits The number of bits in the bitmap. Multiple of 32.
7804 */
7805#if RT_INLINE_ASM_EXTERNAL || (!defined(RT_ARCH_AMD64) && !defined(RT_ARCH_X86))
7806RT_DECL_ASM(int32_t) ASMBitFirstClear(const volatile void RT_FAR *pvBitmap, uint32_t cBits) RT_NOTHROW_PROTO;
7807#else
7808DECLINLINE(int32_t) ASMBitFirstClear(const volatile void RT_FAR *pvBitmap, uint32_t cBits) RT_NOTHROW_DEF
7809{
7810 if (cBits)
7811 {
7812 int32_t iBit;
7813# if RT_INLINE_ASM_GNU_STYLE
7814 RTCCUINTREG uEAX, uECX, uEDI;
7815 cBits = RT_ALIGN_32(cBits, 32);
7816 __asm__ __volatile__("repe; scasl\n\t"
7817 "je 1f\n\t"
7818# ifdef RT_ARCH_AMD64
7819 "lea -4(%%rdi), %%rdi\n\t"
7820 "xorl (%%rdi), %%eax\n\t"
7821 "subq %5, %%rdi\n\t"
7822# else
7823 "lea -4(%%edi), %%edi\n\t"
7824 "xorl (%%edi), %%eax\n\t"
7825 "subl %5, %%edi\n\t"
7826# endif
7827 "shll $3, %%edi\n\t"
7828 "bsfl %%eax, %%edx\n\t"
7829 "addl %%edi, %%edx\n\t"
7830 "1:\t\n"
7831 : "=d" (iBit)
7832 , "=&c" (uECX)
7833 , "=&D" (uEDI)
7834 , "=&a" (uEAX)
7835 : "0" (0xffffffff)
7836 , "mr" (pvBitmap)
7837 , "1" (cBits >> 5)
7838 , "2" (pvBitmap)
7839 , "3" (0xffffffff)
7840 : "cc");
7841# else
7842 cBits = RT_ALIGN_32(cBits, 32);
7843 __asm
7844 {
7845# ifdef RT_ARCH_AMD64
7846 mov rdi, [pvBitmap]
7847 mov rbx, rdi
7848# else
7849 mov edi, [pvBitmap]
7850 mov ebx, edi
7851# endif
7852 mov edx, 0ffffffffh
7853 mov eax, edx
7854 mov ecx, [cBits]
7855 shr ecx, 5
7856 repe scasd
7857 je done
7858
7859# ifdef RT_ARCH_AMD64
7860 lea rdi, [rdi - 4]
7861 xor eax, [rdi]
7862 sub rdi, rbx
7863# else
7864 lea edi, [edi - 4]
7865 xor eax, [edi]
7866 sub edi, ebx
7867# endif
7868 shl edi, 3
7869 bsf edx, eax
7870 add edx, edi
7871 done:
7872 mov [iBit], edx
7873 }
7874# endif
7875 return iBit;
7876 }
7877 return -1;
7878}
7879#endif
7880
7881
7882/**
7883 * Finds the next clear bit in a bitmap.
7884 *
7885 * @returns Index of the first zero bit.
7886 * @returns -1 if no clear bit was found.
7887 * @param pvBitmap Pointer to the bitmap (little endian).
7888 * @param cBits The number of bits in the bitmap. Multiple of 32.
7889 * @param iBitPrev The bit returned from the last search.
7890 * The search will start at iBitPrev + 1.
7891 */
7892#if RT_INLINE_ASM_EXTERNAL || (!defined(RT_ARCH_AMD64) && !defined(RT_ARCH_X86))
7893RT_DECL_ASM(int) ASMBitNextClear(const volatile void RT_FAR *pvBitmap, uint32_t cBits, uint32_t iBitPrev) RT_NOTHROW_PROTO;
7894#else
7895DECLINLINE(int) ASMBitNextClear(const volatile void RT_FAR *pvBitmap, uint32_t cBits, uint32_t iBitPrev) RT_NOTHROW_DEF
7896{
7897 const volatile uint32_t RT_FAR *pau32Bitmap = (const volatile uint32_t RT_FAR *)pvBitmap;
7898 int iBit = ++iBitPrev & 31;
7899 if (iBit)
7900 {
7901 /*
7902 * Inspect the 32-bit word containing the unaligned bit.
7903 */
7904 uint32_t u32 = ~pau32Bitmap[iBitPrev / 32] >> iBit;
7905
7906# if RT_INLINE_ASM_USES_INTRIN
7907 unsigned long ulBit = 0;
7908 if (_BitScanForward(&ulBit, u32))
7909 return ulBit + iBitPrev;
7910# else
7911# if RT_INLINE_ASM_GNU_STYLE
7912 __asm__ __volatile__("bsf %1, %0\n\t"
7913 "jnz 1f\n\t"
7914 "movl $-1, %0\n\t" /** @todo use conditional move for 64-bit? */
7915 "1:\n\t"
7916 : "=r" (iBit)
7917 : "r" (u32)
7918 : "cc");
7919# else
7920 __asm
7921 {
7922 mov edx, [u32]
7923 bsf eax, edx
7924 jnz done
7925 mov eax, 0ffffffffh
7926 done:
7927 mov [iBit], eax
7928 }
7929# endif
7930 if (iBit >= 0)
7931 return iBit + (int)iBitPrev;
7932# endif
7933
7934 /*
7935 * Skip ahead and see if there is anything left to search.
7936 */
7937 iBitPrev |= 31;
7938 iBitPrev++;
7939 if (cBits <= (uint32_t)iBitPrev)
7940 return -1;
7941 }
7942
7943 /*
7944 * 32-bit aligned search, let ASMBitFirstClear do the dirty work.
7945 */
7946 iBit = ASMBitFirstClear(&pau32Bitmap[iBitPrev / 32], cBits - iBitPrev);
7947 if (iBit >= 0)
7948 iBit += iBitPrev;
7949 return iBit;
7950}
7951#endif
7952
7953
7954/**
7955 * Finds the first set bit in a bitmap.
7956 *
7957 * @returns Index of the first set bit.
7958 * @returns -1 if no clear bit was found.
7959 * @param pvBitmap Pointer to the bitmap (little endian).
7960 * @param cBits The number of bits in the bitmap. Multiple of 32.
7961 */
7962#if RT_INLINE_ASM_EXTERNAL || (!defined(RT_ARCH_AMD64) && !defined(RT_ARCH_X86))
7963RT_DECL_ASM(int32_t) ASMBitFirstSet(const volatile void RT_FAR *pvBitmap, uint32_t cBits) RT_NOTHROW_PROTO;
7964#else
7965DECLINLINE(int32_t) ASMBitFirstSet(const volatile void RT_FAR *pvBitmap, uint32_t cBits) RT_NOTHROW_DEF
7966{
7967 if (cBits)
7968 {
7969 int32_t iBit;
7970# if RT_INLINE_ASM_GNU_STYLE
7971 RTCCUINTREG uEAX, uECX, uEDI;
7972 cBits = RT_ALIGN_32(cBits, 32);
7973 __asm__ __volatile__("repe; scasl\n\t"
7974 "je 1f\n\t"
7975# ifdef RT_ARCH_AMD64
7976 "lea -4(%%rdi), %%rdi\n\t"
7977 "movl (%%rdi), %%eax\n\t"
7978 "subq %5, %%rdi\n\t"
7979# else
7980 "lea -4(%%edi), %%edi\n\t"
7981 "movl (%%edi), %%eax\n\t"
7982 "subl %5, %%edi\n\t"
7983# endif
7984 "shll $3, %%edi\n\t"
7985 "bsfl %%eax, %%edx\n\t"
7986 "addl %%edi, %%edx\n\t"
7987 "1:\t\n"
7988 : "=d" (iBit)
7989 , "=&c" (uECX)
7990 , "=&D" (uEDI)
7991 , "=&a" (uEAX)
7992 : "0" (0xffffffff)
7993 , "mr" (pvBitmap)
7994 , "1" (cBits >> 5)
7995 , "2" (pvBitmap)
7996 , "3" (0)
7997 : "cc");
7998# else
7999 cBits = RT_ALIGN_32(cBits, 32);
8000 __asm
8001 {
8002# ifdef RT_ARCH_AMD64
8003 mov rdi, [pvBitmap]
8004 mov rbx, rdi
8005# else
8006 mov edi, [pvBitmap]
8007 mov ebx, edi
8008# endif
8009 mov edx, 0ffffffffh
8010 xor eax, eax
8011 mov ecx, [cBits]
8012 shr ecx, 5
8013 repe scasd
8014 je done
8015# ifdef RT_ARCH_AMD64
8016 lea rdi, [rdi - 4]
8017 mov eax, [rdi]
8018 sub rdi, rbx
8019# else
8020 lea edi, [edi - 4]
8021 mov eax, [edi]
8022 sub edi, ebx
8023# endif
8024 shl edi, 3
8025 bsf edx, eax
8026 add edx, edi
8027 done:
8028 mov [iBit], edx
8029 }
8030# endif
8031 return iBit;
8032 }
8033 return -1;
8034}
8035#endif
8036
8037
8038/**
8039 * Finds the next set bit in a bitmap.
8040 *
8041 * @returns Index of the next set bit.
8042 * @returns -1 if no set bit was found.
8043 * @param pvBitmap Pointer to the bitmap (little endian).
8044 * @param cBits The number of bits in the bitmap. Multiple of 32.
8045 * @param iBitPrev The bit returned from the last search.
8046 * The search will start at iBitPrev + 1.
8047 */
8048#if RT_INLINE_ASM_EXTERNAL || (!defined(RT_ARCH_AMD64) && !defined(RT_ARCH_X86))
8049RT_DECL_ASM(int) ASMBitNextSet(const volatile void RT_FAR *pvBitmap, uint32_t cBits, uint32_t iBitPrev) RT_NOTHROW_PROTO;
8050#else
8051DECLINLINE(int) ASMBitNextSet(const volatile void RT_FAR *pvBitmap, uint32_t cBits, uint32_t iBitPrev) RT_NOTHROW_DEF
8052{
8053 const volatile uint32_t RT_FAR *pau32Bitmap = (const volatile uint32_t RT_FAR *)pvBitmap;
8054 int iBit = ++iBitPrev & 31;
8055 if (iBit)
8056 {
8057 /*
8058 * Inspect the 32-bit word containing the unaligned bit.
8059 */
8060 uint32_t u32 = pau32Bitmap[iBitPrev / 32] >> iBit;
8061
8062# if RT_INLINE_ASM_USES_INTRIN
8063 unsigned long ulBit = 0;
8064 if (_BitScanForward(&ulBit, u32))
8065 return ulBit + iBitPrev;
8066# else
8067# if RT_INLINE_ASM_GNU_STYLE
8068 __asm__ __volatile__("bsf %1, %0\n\t"
8069 "jnz 1f\n\t" /** @todo use conditional move for 64-bit? */
8070 "movl $-1, %0\n\t"
8071 "1:\n\t"
8072 : "=r" (iBit)
8073 : "r" (u32)
8074 : "cc");
8075# else
8076 __asm
8077 {
8078 mov edx, [u32]
8079 bsf eax, edx
8080 jnz done
8081 mov eax, 0ffffffffh
8082 done:
8083 mov [iBit], eax
8084 }
8085# endif
8086 if (iBit >= 0)
8087 return iBit + (int)iBitPrev;
8088# endif
8089
8090 /*
8091 * Skip ahead and see if there is anything left to search.
8092 */
8093 iBitPrev |= 31;
8094 iBitPrev++;
8095 if (cBits <= (uint32_t)iBitPrev)
8096 return -1;
8097 }
8098
8099 /*
8100 * 32-bit aligned search, let ASMBitFirstClear do the dirty work.
8101 */
8102 iBit = ASMBitFirstSet(&pau32Bitmap[iBitPrev / 32], cBits - iBitPrev);
8103 if (iBit >= 0)
8104 iBit += iBitPrev;
8105 return iBit;
8106}
8107#endif
8108
8109/** @} */
8110
8111
8112/** @defgroup grp_inline_bits Bitmap Operations
8113 * @{
8114 */
8115
8116/**
8117 * Finds the first bit which is set in the given 32-bit integer.
8118 * Bits are numbered from 1 (least significant) to 32.
8119 *
8120 * @returns index [1..32] of the first set bit.
8121 * @returns 0 if all bits are cleared.
8122 * @param u32 Integer to search for set bits.
8123 * @remarks Similar to ffs() in BSD.
8124 */
8125#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
8126RT_ASM_DECL_PRAGMA_WATCOM_386(unsigned) ASMBitFirstSetU32(uint32_t u32) RT_NOTHROW_PROTO;
8127#else
8128DECLINLINE(unsigned) ASMBitFirstSetU32(uint32_t u32) RT_NOTHROW_DEF
8129{
8130# if RT_INLINE_ASM_USES_INTRIN
8131 unsigned long iBit;
8132 if (_BitScanForward(&iBit, u32))
8133 iBit++;
8134 else
8135 iBit = 0;
8136
8137# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
8138# if RT_INLINE_ASM_GNU_STYLE
8139 uint32_t iBit;
8140 __asm__ __volatile__("bsf %1, %0\n\t"
8141 "jnz 1f\n\t"
8142 "xorl %0, %0\n\t"
8143 "jmp 2f\n"
8144 "1:\n\t"
8145 "incl %0\n"
8146 "2:\n\t"
8147 : "=r" (iBit)
8148 : "rm" (u32)
8149 : "cc");
8150# else
8151 uint32_t iBit;
8152 _asm
8153 {
8154 bsf eax, [u32]
8155 jnz found
8156 xor eax, eax
8157 jmp done
8158 found:
8159 inc eax
8160 done:
8161 mov [iBit], eax
8162 }
8163# endif
8164
8165# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
8166 /*
8167 * Using the "count leading zeros (clz)" instruction here because there
8168 * is no dedicated instruction to get the first set bit.
8169 * Need to reverse the bits in the value with "rbit" first because
8170 * "clz" starts counting from the most significant bit.
8171 */
8172 uint32_t iBit;
8173 __asm__ __volatile__(
8174# if defined(RT_ARCH_ARM64)
8175 "rbit %w[uVal], %w[uVal]\n\t"
8176 "clz %w[iBit], %w[uVal]\n\t"
8177# else
8178 "rbit %[uVal], %[uVal]\n\t"
8179 "clz %[iBit], %[uVal]\n\t"
8180# endif
8181 : [uVal] "=r" (u32)
8182 , [iBit] "=r" (iBit)
8183 : "[uVal]" (u32));
8184 if (iBit != 32)
8185 iBit++;
8186 else
8187 iBit = 0; /* No bit set. */
8188
8189# else
8190# error "Port me"
8191# endif
8192 return iBit;
8193}
8194#endif
8195
8196
8197/**
8198 * Finds the first bit which is set in the given 32-bit integer.
8199 * Bits are numbered from 1 (least significant) to 32.
8200 *
8201 * @returns index [1..32] of the first set bit.
8202 * @returns 0 if all bits are cleared.
8203 * @param i32 Integer to search for set bits.
8204 * @remark Similar to ffs() in BSD.
8205 */
8206DECLINLINE(unsigned) ASMBitFirstSetS32(int32_t i32) RT_NOTHROW_DEF
8207{
8208 return ASMBitFirstSetU32((uint32_t)i32);
8209}
8210
8211
8212/**
8213 * Finds the first bit which is set in the given 64-bit integer.
8214 *
8215 * Bits are numbered from 1 (least significant) to 64.
8216 *
8217 * @returns index [1..64] of the first set bit.
8218 * @returns 0 if all bits are cleared.
8219 * @param u64 Integer to search for set bits.
8220 * @remarks Similar to ffs() in BSD.
8221 */
8222#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
8223RT_ASM_DECL_PRAGMA_WATCOM_386(unsigned) ASMBitFirstSetU64(uint64_t u64) RT_NOTHROW_PROTO;
8224#else
8225DECLINLINE(unsigned) ASMBitFirstSetU64(uint64_t u64) RT_NOTHROW_DEF
8226{
8227# if RT_INLINE_ASM_USES_INTRIN
8228 unsigned long iBit;
8229# if ARCH_BITS == 64
8230 if (_BitScanForward64(&iBit, u64))
8231 iBit++;
8232 else
8233 iBit = 0;
8234# else
8235 if (_BitScanForward(&iBit, (uint32_t)u64))
8236 iBit++;
8237 else if (_BitScanForward(&iBit, (uint32_t)(u64 >> 32)))
8238 iBit += 33;
8239 else
8240 iBit = 0;
8241# endif
8242
8243# elif RT_INLINE_ASM_GNU_STYLE && defined(RT_ARCH_AMD64)
8244 uint64_t iBit;
8245 __asm__ __volatile__("bsfq %1, %0\n\t"
8246 "jnz 1f\n\t"
8247 "xorl %k0, %k0\n\t"
8248 "jmp 2f\n"
8249 "1:\n\t"
8250 "incl %k0\n"
8251 "2:\n\t"
8252 : "=r" (iBit)
8253 : "rm" (u64)
8254 : "cc");
8255
8256# elif defined(RT_ARCH_ARM64)
8257 uint64_t iBit;
8258 __asm__ __volatile__("rbit %[uVal], %[uVal]\n\t"
8259 "clz %[iBit], %[uVal]\n\t"
8260 : [uVal] "=r" (u64)
8261 , [iBit] "=r" (iBit)
8262 : "[uVal]" (u64));
8263 if (iBit != 64)
8264 iBit++;
8265 else
8266 iBit = 0; /* No bit set. */
8267
8268# else
8269 unsigned iBit = ASMBitFirstSetU32((uint32_t)u64);
8270 if (!iBit)
8271 {
8272 iBit = ASMBitFirstSetU32((uint32_t)(u64 >> 32));
8273 if (iBit)
8274 iBit += 32;
8275 }
8276# endif
8277 return (unsigned)iBit;
8278}
8279#endif
8280
8281
8282/**
8283 * Finds the first bit which is set in the given 16-bit integer.
8284 *
8285 * Bits are numbered from 1 (least significant) to 16.
8286 *
8287 * @returns index [1..16] of the first set bit.
8288 * @returns 0 if all bits are cleared.
8289 * @param u16 Integer to search for set bits.
8290 * @remarks For 16-bit bs3kit code.
8291 */
8292#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
8293RT_ASM_DECL_PRAGMA_WATCOM_386(unsigned) ASMBitFirstSetU16(uint16_t u16) RT_NOTHROW_PROTO;
8294#else
8295DECLINLINE(unsigned) ASMBitFirstSetU16(uint16_t u16) RT_NOTHROW_DEF
8296{
8297 return ASMBitFirstSetU32((uint32_t)u16);
8298}
8299#endif
8300
8301
8302/**
8303 * Finds the last bit which is set in the given 32-bit integer.
8304 * Bits are numbered from 1 (least significant) to 32.
8305 *
8306 * @returns index [1..32] of the last set bit.
8307 * @returns 0 if all bits are cleared.
8308 * @param u32 Integer to search for set bits.
8309 * @remark Similar to fls() in BSD.
8310 */
8311#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
8312RT_ASM_DECL_PRAGMA_WATCOM_386(unsigned) ASMBitLastSetU32(uint32_t u32) RT_NOTHROW_PROTO;
8313#else
8314DECLINLINE(unsigned) ASMBitLastSetU32(uint32_t u32) RT_NOTHROW_DEF
8315{
8316# if RT_INLINE_ASM_USES_INTRIN
8317 unsigned long iBit;
8318 if (_BitScanReverse(&iBit, u32))
8319 iBit++;
8320 else
8321 iBit = 0;
8322
8323# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
8324# if RT_INLINE_ASM_GNU_STYLE
8325 uint32_t iBit;
8326 __asm__ __volatile__("bsrl %1, %0\n\t"
8327 "jnz 1f\n\t"
8328 "xorl %0, %0\n\t"
8329 "jmp 2f\n"
8330 "1:\n\t"
8331 "incl %0\n"
8332 "2:\n\t"
8333 : "=r" (iBit)
8334 : "rm" (u32)
8335 : "cc");
8336# else
8337 uint32_t iBit;
8338 _asm
8339 {
8340 bsr eax, [u32]
8341 jnz found
8342 xor eax, eax
8343 jmp done
8344 found:
8345 inc eax
8346 done:
8347 mov [iBit], eax
8348 }
8349# endif
8350
8351# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
8352 uint32_t iBit;
8353 __asm__ __volatile__(
8354# if defined(RT_ARCH_ARM64)
8355 "clz %w[iBit], %w[uVal]\n\t"
8356# else
8357 "clz %[iBit], %[uVal]\n\t"
8358# endif
8359 : [iBit] "=r" (iBit)
8360 : [uVal] "r" (u32));
8361 iBit = 32 - iBit;
8362
8363# else
8364# error "Port me"
8365# endif
8366 return iBit;
8367}
8368#endif
8369
8370
8371/**
8372 * Finds the last bit which is set in the given 32-bit integer.
8373 * Bits are numbered from 1 (least significant) to 32.
8374 *
8375 * @returns index [1..32] of the last set bit.
8376 * @returns 0 if all bits are cleared.
8377 * @param i32 Integer to search for set bits.
8378 * @remark Similar to fls() in BSD.
8379 */
8380DECLINLINE(unsigned) ASMBitLastSetS32(int32_t i32) RT_NOTHROW_DEF
8381{
8382 return ASMBitLastSetU32((uint32_t)i32);
8383}
8384
8385
8386/**
8387 * Finds the last bit which is set in the given 64-bit integer.
8388 *
8389 * Bits are numbered from 1 (least significant) to 64.
8390 *
8391 * @returns index [1..64] of the last set bit.
8392 * @returns 0 if all bits are cleared.
8393 * @param u64 Integer to search for set bits.
8394 * @remark Similar to fls() in BSD.
8395 */
8396#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
8397RT_ASM_DECL_PRAGMA_WATCOM_386(unsigned) ASMBitLastSetU64(uint64_t u64) RT_NOTHROW_PROTO;
8398#else
8399DECLINLINE(unsigned) ASMBitLastSetU64(uint64_t u64) RT_NOTHROW_DEF
8400{
8401# if RT_INLINE_ASM_USES_INTRIN
8402 unsigned long iBit;
8403# if ARCH_BITS == 64
8404 if (_BitScanReverse64(&iBit, u64))
8405 iBit++;
8406 else
8407 iBit = 0;
8408# else
8409 if (_BitScanReverse(&iBit, (uint32_t)(u64 >> 32)))
8410 iBit += 33;
8411 else if (_BitScanReverse(&iBit, (uint32_t)u64))
8412 iBit++;
8413 else
8414 iBit = 0;
8415# endif
8416
8417# elif RT_INLINE_ASM_GNU_STYLE && defined(RT_ARCH_AMD64)
8418 uint64_t iBit;
8419 __asm__ __volatile__("bsrq %1, %0\n\t"
8420 "jnz 1f\n\t"
8421 "xorl %k0, %k0\n\t"
8422 "jmp 2f\n"
8423 "1:\n\t"
8424 "incl %k0\n"
8425 "2:\n\t"
8426 : "=r" (iBit)
8427 : "rm" (u64)
8428 : "cc");
8429
8430# elif defined(RT_ARCH_ARM64)
8431 uint64_t iBit;
8432 __asm__ __volatile__("clz %[iBit], %[uVal]\n\t"
8433 : [iBit] "=r" (iBit)
8434 : [uVal] "r" (u64));
8435 iBit = 64 - iBit;
8436
8437# else
8438 unsigned iBit = ASMBitLastSetU32((uint32_t)(u64 >> 32));
8439 if (iBit)
8440 iBit += 32;
8441 else
8442 iBit = ASMBitLastSetU32((uint32_t)u64);
8443# endif
8444 return (unsigned)iBit;
8445}
8446#endif
8447
8448
8449/**
8450 * Finds the last bit which is set in the given 16-bit integer.
8451 *
8452 * Bits are numbered from 1 (least significant) to 16.
8453 *
8454 * @returns index [1..16] of the last set bit.
8455 * @returns 0 if all bits are cleared.
8456 * @param u16 Integer to search for set bits.
8457 * @remarks For 16-bit bs3kit code.
8458 */
8459#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
8460RT_ASM_DECL_PRAGMA_WATCOM_386(unsigned) ASMBitLastSetU16(uint16_t u16) RT_NOTHROW_PROTO;
8461#else
8462DECLINLINE(unsigned) ASMBitLastSetU16(uint16_t u16) RT_NOTHROW_DEF
8463{
8464 return ASMBitLastSetU32((uint32_t)u16);
8465}
8466#endif
8467
8468
8469/**
8470 * Count the number of leading zero bits in the given 32-bit integer.
8471 *
8472 * The counting starts with the most significate bit.
8473 *
8474 * @returns Number of most significant zero bits.
8475 * @returns 32 if all bits are cleared.
8476 * @param u32 Integer to consider.
8477 * @remarks Similar to __builtin_clz() in gcc, except defined zero input result.
8478 */
8479#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
8480RT_ASM_DECL_PRAGMA_WATCOM_386(unsigned) ASMCountLeadingZerosU32(uint32_t u32) RT_NOTHROW_PROTO;
8481#else
8482DECLINLINE(unsigned) ASMCountLeadingZerosU32(uint32_t u32) RT_NOTHROW_DEF
8483{
8484# if RT_INLINE_ASM_USES_INTRIN
8485 unsigned long iBit;
8486 if (!_BitScanReverse(&iBit, u32))
8487 return 32;
8488 return 31 - (unsigned)iBit;
8489
8490# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
8491 uint32_t iBit;
8492# if RT_INLINE_ASM_GNU_STYLE && defined(RT_ARCH_AMD64) && 0 /* significantly slower on 10980xe; 929 vs 237 ps/call */
8493 __asm__ __volatile__("bsrl %1, %0\n\t"
8494 "cmovzl %2, %0\n\t"
8495 : "=&r" (iBit)
8496 : "rm" (u32)
8497 , "rm" ((int32_t)-1)
8498 : "cc");
8499# elif RT_INLINE_ASM_GNU_STYLE
8500 __asm__ __volatile__("bsr %1, %0\n\t"
8501 "jnz 1f\n\t"
8502 "mov $-1, %0\n\t"
8503 "1:\n\t"
8504 : "=r" (iBit)
8505 : "rm" (u32)
8506 : "cc");
8507# else
8508 _asm
8509 {
8510 bsr eax, [u32]
8511 jnz found
8512 mov eax, -1
8513 found:
8514 mov [iBit], eax
8515 }
8516# endif
8517 return 31 - iBit;
8518
8519# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
8520 uint32_t iBit;
8521 __asm__ __volatile__(
8522# if defined(RT_ARCH_ARM64)
8523 "clz %w[iBit], %w[uVal]\n\t"
8524# else
8525 "clz %[iBit], %[uVal]\n\t"
8526# endif
8527 : [uVal] "=r" (u32)
8528 , [iBit] "=r" (iBit)
8529 : "[uVal]" (u32));
8530 return iBit;
8531
8532# elif defined(__GNUC__)
8533 AssertCompile(sizeof(u32) == sizeof(unsigned int));
8534 return u32 ? __builtin_clz(u32) : 32;
8535
8536# else
8537# error "Port me"
8538# endif
8539}
8540#endif
8541
8542
8543/**
8544 * Count the number of leading zero bits in the given 64-bit integer.
8545 *
8546 * The counting starts with the most significate bit.
8547 *
8548 * @returns Number of most significant zero bits.
8549 * @returns 64 if all bits are cleared.
8550 * @param u64 Integer to consider.
8551 * @remarks Similar to __builtin_clzl() in gcc, except defined zero input
8552 * result.
8553 */
8554#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
8555RT_ASM_DECL_PRAGMA_WATCOM_386(unsigned) ASMCountLeadingZerosU64(uint64_t u64) RT_NOTHROW_PROTO;
8556#else
8557DECLINLINE(unsigned) ASMCountLeadingZerosU64(uint64_t u64) RT_NOTHROW_DEF
8558{
8559# if RT_INLINE_ASM_USES_INTRIN
8560 unsigned long iBit;
8561# if ARCH_BITS == 64
8562 if (_BitScanReverse64(&iBit, u64))
8563 return 63 - (unsigned)iBit;
8564# else
8565 if (_BitScanReverse(&iBit, (uint32_t)(u64 >> 32)))
8566 return 31 - (unsigned)iBit;
8567 if (_BitScanReverse(&iBit, (uint32_t)u64))
8568 return 63 - (unsigned)iBit;
8569# endif
8570 return 64;
8571
8572# elif RT_INLINE_ASM_GNU_STYLE && defined(RT_ARCH_AMD64)
8573 uint64_t iBit;
8574# if 0 /* 10980xe benchmark: 932 ps/call - the slower variant */
8575 __asm__ __volatile__("bsrq %1, %0\n\t"
8576 "cmovzq %2, %0\n\t"
8577 : "=&r" (iBit)
8578 : "rm" (u64)
8579 , "rm" ((int64_t)-1)
8580 : "cc");
8581# else /* 10980xe benchmark: 262 ps/call */
8582 __asm__ __volatile__("bsrq %1, %0\n\t"
8583 "jnz 1f\n\t"
8584 "mov $-1, %0\n\t"
8585 "1:\n\t"
8586 : "=&r" (iBit)
8587 : "rm" (u64)
8588 : "cc");
8589# endif
8590 return 63 - (unsigned)iBit;
8591
8592# elif defined(RT_ARCH_ARM64)
8593 uint64_t iBit;
8594 __asm__ __volatile__("clz %[iBit], %[uVal]\n\t"
8595 : [uVal] "=r" (u64)
8596 , [iBit] "=r" (iBit)
8597 : "[uVal]" (u64));
8598 return (unsigned)iBit;
8599
8600# elif defined(__GNUC__) && ARCH_BITS == 64
8601 AssertCompile(sizeof(u64) == sizeof(unsigned long));
8602 return u64 ? __builtin_clzl(u64) : 64;
8603
8604# else
8605 unsigned iBit = ASMCountLeadingZerosU32((uint32_t)(u64 >> 32));
8606 if (iBit == 32)
8607 iBit = ASMCountLeadingZerosU32((uint32_t)u64) + 32;
8608 return iBit;
8609# endif
8610}
8611#endif
8612
8613
8614/**
8615 * Count the number of leading zero bits in the given 16-bit integer.
8616 *
8617 * The counting starts with the most significate bit.
8618 *
8619 * @returns Number of most significant zero bits.
8620 * @returns 16 if all bits are cleared.
8621 * @param u16 Integer to consider.
8622 */
8623#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
8624RT_ASM_DECL_PRAGMA_WATCOM_386(unsigned) ASMCountLeadingZerosU16(uint16_t u16) RT_NOTHROW_PROTO;
8625#else
8626DECLINLINE(unsigned) ASMCountLeadingZerosU16(uint16_t u16) RT_NOTHROW_DEF
8627{
8628# if RT_INLINE_ASM_GNU_STYLE && (defined(RT_ARCH_X86) || defined(RT_ARCH_AMD64)) && 0 /* slower (10980xe: 987 vs 292 ps/call) */
8629 uint16_t iBit;
8630 __asm__ __volatile__("bsrw %1, %0\n\t"
8631 "jnz 1f\n\t"
8632 "mov $-1, %0\n\t"
8633 "1:\n\t"
8634 : "=r" (iBit)
8635 : "rm" (u16)
8636 : "cc");
8637 return 15 - (int16_t)iBit;
8638# else
8639 return ASMCountLeadingZerosU32((uint32_t)u16) - 16;
8640# endif
8641}
8642#endif
8643
8644
8645/**
8646 * Count the number of trailing zero bits in the given 32-bit integer.
8647 *
8648 * The counting starts with the least significate bit, i.e. the zero bit.
8649 *
8650 * @returns Number of least significant zero bits.
8651 * @returns 32 if all bits are cleared.
8652 * @param u32 Integer to consider.
8653 * @remarks Similar to __builtin_ctz() in gcc, except defined zero input result.
8654 */
8655#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
8656RT_ASM_DECL_PRAGMA_WATCOM_386(unsigned) ASMCountTrailingZerosU32(uint32_t u32) RT_NOTHROW_PROTO;
8657#else
8658DECLINLINE(unsigned) ASMCountTrailingZerosU32(uint32_t u32) RT_NOTHROW_DEF
8659{
8660# if RT_INLINE_ASM_USES_INTRIN
8661 unsigned long iBit;
8662 if (!_BitScanForward(&iBit, u32))
8663 return 32;
8664 return (unsigned)iBit;
8665
8666# elif defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
8667 uint32_t iBit;
8668# if RT_INLINE_ASM_GNU_STYLE && defined(RT_ARCH_AMD64) && 0 /* significantly slower on 10980xe; 932 vs 240 ps/call */
8669 __asm__ __volatile__("bsfl %1, %0\n\t"
8670 "cmovzl %2, %0\n\t"
8671 : "=&r" (iBit)
8672 : "rm" (u32)
8673 , "rm" ((int32_t)32)
8674 : "cc");
8675# elif RT_INLINE_ASM_GNU_STYLE
8676 __asm__ __volatile__("bsfl %1, %0\n\t"
8677 "jnz 1f\n\t"
8678 "mov $32, %0\n\t"
8679 "1:\n\t"
8680 : "=r" (iBit)
8681 : "rm" (u32)
8682 : "cc");
8683# else
8684 _asm
8685 {
8686 bsf eax, [u32]
8687 jnz found
8688 mov eax, 32
8689 found:
8690 mov [iBit], eax
8691 }
8692# endif
8693 return iBit;
8694
8695# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
8696 /* Invert the bits and use clz. */
8697 uint32_t iBit;
8698 __asm__ __volatile__(
8699# if defined(RT_ARCH_ARM64)
8700 "rbit %w[uVal], %w[uVal]\n\t"
8701 "clz %w[iBit], %w[uVal]\n\t"
8702# else
8703 "rbit %[uVal], %[uVal]\n\t"
8704 "clz %[iBit], %[uVal]\n\t"
8705# endif
8706 : [uVal] "=r" (u32)
8707 , [iBit] "=r" (iBit)
8708 : "[uVal]" (u32));
8709 return iBit;
8710
8711# elif defined(__GNUC__)
8712 AssertCompile(sizeof(u32) == sizeof(unsigned int));
8713 return u32 ? __builtin_ctz(u32) : 32;
8714
8715# else
8716# error "Port me"
8717# endif
8718}
8719#endif
8720
8721
8722/**
8723 * Count the number of trailing zero bits in the given 64-bit integer.
8724 *
8725 * The counting starts with the least significate bit.
8726 *
8727 * @returns Number of least significant zero bits.
8728 * @returns 64 if all bits are cleared.
8729 * @param u64 Integer to consider.
8730 * @remarks Similar to __builtin_ctzl() in gcc, except defined zero input
8731 * result.
8732 */
8733#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
8734RT_ASM_DECL_PRAGMA_WATCOM_386(unsigned) ASMCountTrailingZerosU64(uint64_t u64) RT_NOTHROW_PROTO;
8735#else
8736DECLINLINE(unsigned) ASMCountTrailingZerosU64(uint64_t u64) RT_NOTHROW_DEF
8737{
8738# if RT_INLINE_ASM_USES_INTRIN
8739 unsigned long iBit;
8740# if ARCH_BITS == 64
8741 if (_BitScanForward64(&iBit, u64))
8742 return (unsigned)iBit;
8743# else
8744 if (_BitScanForward(&iBit, (uint32_t)u64))
8745 return (unsigned)iBit;
8746 if (_BitScanForward(&iBit, (uint32_t)(u64 >> 32)))
8747 return (unsigned)iBit + 32;
8748# endif
8749 return 64;
8750
8751# elif RT_INLINE_ASM_GNU_STYLE && defined(RT_ARCH_AMD64)
8752 uint64_t iBit;
8753# if 0 /* 10980xe benchmark: 932 ps/call - the slower variant */
8754 __asm__ __volatile__("bsfq %1, %0\n\t"
8755 "cmovzq %2, %0\n\t"
8756 : "=&r" (iBit)
8757 : "rm" (u64)
8758 , "rm" ((int64_t)64)
8759 : "cc");
8760# else /* 10980xe benchmark: 262 ps/call */
8761 __asm__ __volatile__("bsfq %1, %0\n\t"
8762 "jnz 1f\n\t"
8763 "mov $64, %0\n\t"
8764 "1:\n\t"
8765 : "=&r" (iBit)
8766 : "rm" (u64)
8767 : "cc");
8768# endif
8769 return (unsigned)iBit;
8770
8771# elif defined(RT_ARCH_ARM64)
8772 /* Invert the bits and use clz. */
8773 uint64_t iBit;
8774 __asm__ __volatile__("rbit %[uVal], %[uVal]\n\t"
8775 "clz %[iBit], %[uVal]\n\t"
8776 : [uVal] "=r" (u64)
8777 , [iBit] "=r" (iBit)
8778 : "[uVal]" (u64));
8779 return (unsigned)iBit;
8780
8781# elif defined(__GNUC__) && ARCH_BITS == 64
8782 AssertCompile(sizeof(u64) == sizeof(unsigned long));
8783 return u64 ? __builtin_ctzl(u64) : 64;
8784
8785# else
8786 unsigned iBit = ASMCountTrailingZerosU32((uint32_t)u64);
8787 if (iBit == 32)
8788 iBit = ASMCountTrailingZerosU32((uint32_t)(u64 >> 32)) + 32;
8789 return iBit;
8790# endif
8791}
8792#endif
8793
8794
8795/**
8796 * Count the number of trailing zero bits in the given 16-bit integer.
8797 *
8798 * The counting starts with the most significate bit.
8799 *
8800 * @returns Number of most significant zero bits.
8801 * @returns 16 if all bits are cleared.
8802 * @param u16 Integer to consider.
8803 */
8804#if RT_INLINE_ASM_EXTERNAL_TMP_ARM && !RT_INLINE_ASM_USES_INTRIN
8805RT_ASM_DECL_PRAGMA_WATCOM_386(unsigned) ASMCountTrailingZerosU16(uint16_t u16) RT_NOTHROW_PROTO;
8806#else
8807DECLINLINE(unsigned) ASMCountTrailingZerosU16(uint16_t u16) RT_NOTHROW_DEF
8808{
8809# if RT_INLINE_ASM_GNU_STYLE && (defined(RT_ARCH_X86) || defined(RT_ARCH_AMD64)) && 0 /* slower (10980xe: 992 vs 349 ps/call) */
8810 uint16_t iBit;
8811 __asm__ __volatile__("bsfw %1, %0\n\t"
8812 "jnz 1f\n\t"
8813 "mov $16, %0\n\t"
8814 "1:\n\t"
8815 : "=r" (iBit)
8816 : "rm" (u16)
8817 : "cc");
8818 return iBit;
8819# else
8820 return ASMCountTrailingZerosU32((uint32_t)u16 | UINT32_C(0x10000));
8821#endif
8822}
8823#endif
8824
8825
8826/**
8827 * Rotate 32-bit unsigned value to the left by @a cShift.
8828 *
8829 * @returns Rotated value.
8830 * @param u32 The value to rotate.
8831 * @param cShift How many bits to rotate by.
8832 */
8833#ifdef __WATCOMC__
8834RT_ASM_DECL_PRAGMA_WATCOM(uint32_t) ASMRotateLeftU32(uint32_t u32, unsigned cShift) RT_NOTHROW_PROTO;
8835#else
8836DECLINLINE(uint32_t) ASMRotateLeftU32(uint32_t u32, uint32_t cShift) RT_NOTHROW_DEF
8837{
8838# if RT_INLINE_ASM_USES_INTRIN
8839 return _rotl(u32, cShift);
8840
8841# elif RT_INLINE_ASM_GNU_STYLE && (defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86))
8842 __asm__ __volatile__("roll %b1, %0" : "=g" (u32) : "Ic" (cShift), "0" (u32) : "cc");
8843 return u32;
8844
8845# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
8846 __asm__ __volatile__(
8847# if defined(RT_ARCH_ARM64)
8848 "ror %w[uRet], %w[uVal], %w[cShift]\n\t"
8849# else
8850 "ror %[uRet], %[uVal], %[cShift]\n\t"
8851# endif
8852 : [uRet] "=r" (u32)
8853 : [uVal] "[uRet]" (u32)
8854 , [cShift] "r" (32 - (cShift & 31))); /** @todo there is an immediate form here */
8855 return u32;
8856
8857# else
8858 cShift &= 31;
8859 return (u32 << cShift) | (u32 >> (32 - cShift));
8860# endif
8861}
8862#endif
8863
8864
8865/**
8866 * Rotate 32-bit unsigned value to the right by @a cShift.
8867 *
8868 * @returns Rotated value.
8869 * @param u32 The value to rotate.
8870 * @param cShift How many bits to rotate by.
8871 */
8872#ifdef __WATCOMC__
8873RT_ASM_DECL_PRAGMA_WATCOM(uint32_t) ASMRotateRightU32(uint32_t u32, unsigned cShift) RT_NOTHROW_PROTO;
8874#else
8875DECLINLINE(uint32_t) ASMRotateRightU32(uint32_t u32, uint32_t cShift) RT_NOTHROW_DEF
8876{
8877# if RT_INLINE_ASM_USES_INTRIN
8878 return _rotr(u32, cShift);
8879
8880# elif RT_INLINE_ASM_GNU_STYLE && (defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86))
8881 __asm__ __volatile__("rorl %b1, %0" : "=g" (u32) : "Ic" (cShift), "0" (u32) : "cc");
8882 return u32;
8883
8884# elif defined(RT_ARCH_ARM64) || defined(RT_ARCH_ARM32)
8885 __asm__ __volatile__(
8886# if defined(RT_ARCH_ARM64)
8887 "ror %w[uRet], %w[uVal], %w[cShift]\n\t"
8888# else
8889 "ror %[uRet], %[uVal], %[cShift]\n\t"
8890# endif
8891 : [uRet] "=r" (u32)
8892 : [uVal] "[uRet]" (u32)
8893 , [cShift] "r" (cShift & 31)); /** @todo there is an immediate form here */
8894 return u32;
8895
8896# else
8897 cShift &= 31;
8898 return (u32 >> cShift) | (u32 << (32 - cShift));
8899# endif
8900}
8901#endif
8902
8903
8904/**
8905 * Rotate 64-bit unsigned value to the left by @a cShift.
8906 *
8907 * @returns Rotated value.
8908 * @param u64 The value to rotate.
8909 * @param cShift How many bits to rotate by.
8910 */
8911DECLINLINE(uint64_t) ASMRotateLeftU64(uint64_t u64, uint32_t cShift) RT_NOTHROW_DEF
8912{
8913#if RT_INLINE_ASM_USES_INTRIN
8914 return _rotl64(u64, cShift);
8915
8916#elif RT_INLINE_ASM_GNU_STYLE && defined(RT_ARCH_AMD64)
8917 __asm__ __volatile__("rolq %b1, %0" : "=g" (u64) : "Jc" (cShift), "0" (u64) : "cc");
8918 return u64;
8919
8920#elif RT_INLINE_ASM_GNU_STYLE && defined(RT_ARCH_X86)
8921 uint32_t uSpill;
8922 __asm__ __volatile__("testb $0x20, %%cl\n\t" /* if (cShift >= 0x20) { swap(u64.hi, u64lo); cShift -= 0x20; } */
8923 "jz 1f\n\t"
8924 "xchgl %%eax, %%edx\n\t"
8925 "1:\n\t"
8926 "andb $0x1f, %%cl\n\t" /* if (cShift & 0x1f) { */
8927 "jz 2f\n\t"
8928 "movl %%edx, %2\n\t" /* save the hi value in %3. */
8929 "shldl %%cl,%%eax,%%edx\n\t" /* shift the hi value left, feeding MSBits from the low value. */
8930 "shldl %%cl,%2,%%eax\n\t" /* shift the lo value left, feeding MSBits from the saved hi value. */
8931 "2:\n\t" /* } */
8932 : "=A" (u64)
8933 , "=c" (cShift)
8934 , "=r" (uSpill)
8935 : "0" (u64)
8936 , "1" (cShift)
8937 : "cc");
8938 return u64;
8939
8940# elif defined(RT_ARCH_ARM64)
8941 __asm__ __volatile__("ror %[uRet], %[uVal], %[cShift]\n\t"
8942 : [uRet] "=r" (u64)
8943 : [uVal] "[uRet]" (u64)
8944 , [cShift] "r" ((uint64_t)(64 - (cShift & 63)))); /** @todo there is an immediate form here */
8945 return u64;
8946
8947#else
8948 cShift &= 63;
8949 return (u64 << cShift) | (u64 >> (64 - cShift));
8950#endif
8951}
8952
8953
8954/**
8955 * Rotate 64-bit unsigned value to the right by @a cShift.
8956 *
8957 * @returns Rotated value.
8958 * @param u64 The value to rotate.
8959 * @param cShift How many bits to rotate by.
8960 */
8961DECLINLINE(uint64_t) ASMRotateRightU64(uint64_t u64, uint32_t cShift) RT_NOTHROW_DEF
8962{
8963#if RT_INLINE_ASM_USES_INTRIN
8964 return _rotr64(u64, cShift);
8965
8966#elif RT_INLINE_ASM_GNU_STYLE && defined(RT_ARCH_AMD64)
8967 __asm__ __volatile__("rorq %b1, %0" : "=g" (u64) : "Jc" (cShift), "0" (u64) : "cc");
8968 return u64;
8969
8970#elif RT_INLINE_ASM_GNU_STYLE && defined(RT_ARCH_X86)
8971 uint32_t uSpill;
8972 __asm__ __volatile__("testb $0x20, %%cl\n\t" /* if (cShift >= 0x20) { swap(u64.hi, u64lo); cShift -= 0x20; } */
8973 "jz 1f\n\t"
8974 "xchgl %%eax, %%edx\n\t"
8975 "1:\n\t"
8976 "andb $0x1f, %%cl\n\t" /* if (cShift & 0x1f) { */
8977 "jz 2f\n\t"
8978 "movl %%edx, %2\n\t" /* save the hi value in %3. */
8979 "shrdl %%cl,%%eax,%%edx\n\t" /* shift the hi value right, feeding LSBits from the low value. */
8980 "shrdl %%cl,%2,%%eax\n\t" /* shift the lo value right, feeding LSBits from the saved hi value. */
8981 "2:\n\t" /* } */
8982 : "=A" (u64)
8983 , "=c" (cShift)
8984 , "=r" (uSpill)
8985 : "0" (u64)
8986 , "1" (cShift)
8987 : "cc");
8988 return u64;
8989
8990# elif defined(RT_ARCH_ARM64)
8991 __asm__ __volatile__("ror %[uRet], %[uVal], %[cShift]\n\t"
8992 : [uRet] "=r" (u64)
8993 : [uVal] "[uRet]" (u64)
8994 , [cShift] "r" ((uint64_t)(cShift & 63))); /** @todo there is an immediate form here */
8995 return u64;
8996
8997#else
8998 cShift &= 63;
8999 return (u64 >> cShift) | (u64 << (64 - cShift));
9000#endif
9001}
9002
9003/** @} */
9004
9005
9006/** @} */
9007
9008/*
9009 * Include #pragma aux definitions for Watcom C/C++.
9010 */
9011#if defined(__WATCOMC__) && ARCH_BITS == 16 && defined(RT_ARCH_X86)
9012# define IPRT_ASM_WATCOM_X86_16_WITH_PRAGMAS
9013# undef IPRT_INCLUDED_asm_watcom_x86_16_h
9014# include "asm-watcom-x86-16.h"
9015#elif defined(__WATCOMC__) && ARCH_BITS == 32 && defined(RT_ARCH_X86)
9016# define IPRT_ASM_WATCOM_X86_32_WITH_PRAGMAS
9017# undef IPRT_INCLUDED_asm_watcom_x86_32_h
9018# include "asm-watcom-x86-32.h"
9019#endif
9020
9021#endif /* !IPRT_INCLUDED_asm_h */
9022
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette