asm-math.h@ 83776

Last change on this file since 83776 was 83776, checked in by vboxsync, 4 years ago
iprt/asm*,++: Contain the intrin.h mess in iprt/win/msvc_intrin.h. Hack it for bs3kit and no-crt to avoid dragging in malloc.h from UCRT with VC++ 14.1. bugref:8489
Property svn:eol-style set to `native` Property svn:keywords set to `Author Date Id Revision`
File size: 13.1 KB

Line
1	/** @file
2	* IPRT - Assembly Routines for Optimizing some Integer Math Operations.
3	*/
4
5	/*
6	* Copyright (C) 2006-2020 Oracle Corporation
7	*
8	* This file is part of VirtualBox Open Source Edition (OSE), as
9	* available from http://www.virtualbox.org. This file is free software;
10	* you can redistribute it and/or modify it under the terms of the GNU
11	* General Public License (GPL) as published by the Free Software
12	* Foundation, in version 2 as it comes in the "COPYING" file of the
13	* VirtualBox OSE distribution. VirtualBox OSE is distributed in the
14	* hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
15	*
16	* The contents of this file may alternatively be used under the terms
17	* of the Common Development and Distribution License Version 1.0
18	* (CDDL) only, as it comes in the "COPYING.CDDL" file of the
19	* VirtualBox OSE distribution, in which case the provisions of the
20	* CDDL are applicable instead of those of the GPL.
21	*
22	* You may elect to license modified versions of this file under the
23	* terms and conditions of either the GPL or the CDDL or both.
24	*/
25
26	#ifndef IPRT_INCLUDED_asm_math_h
27	#define IPRT_INCLUDED_asm_math_h
28	#ifndef RT_WITHOUT_PRAGMA_ONCE
29	# pragma once
30	#endif
31
32	#include <iprt/types.h>
33
34	#if defined(_MSC_VER) && RT_INLINE_ASM_USES_INTRIN
35	/* Emit the intrinsics at all optimization levels. */
36	# include <iprt/win/msvc_intrin.h>
37	# pragma intrinsic(__emul)
38	# pragma intrinsic(__emulu)
39	# ifdef RT_ARCH_AMD64
40	# pragma intrinsic(_mul128)
41	# pragma intrinsic(_umul128)
42	# endif
43	#endif
44
45
46	/** @defgroup grp_rt_asm_math Integer Math Optimizations
47	* @ingroup grp_rt_asm
48	* @{ */
49
50	/**
51	* Multiplies two unsigned 32-bit values returning an unsigned 64-bit result.
52	*
53	* @returns u32F1 * u32F2.
54	*/
55
56	#if RT_INLINE_ASM_EXTERNAL && !RT_INLINE_ASM_USES_INTRIN && defined(RT_ARCH_X86)
57	DECLASM(uint64_t) ASMMult2xU32RetU64(uint32_t u32F1, uint32_t u32F2);
58	#else
59	DECLINLINE(uint64_t) ASMMult2xU32RetU64(uint32_t u32F1, uint32_t u32F2)
60	{
61	# ifdef RT_ARCH_X86
62	uint64_t u64;
63	# if RT_INLINE_ASM_GNU_STYLE
64	__asm__ __volatile__("mull %%edx"
65	: "=A" (u64)
66	: "a" (u32F2), "d" (u32F1));
67	# elif RT_INLINE_ASM_USES_INTRIN
68	u64 = __emulu(u32F1, u32F2);
69	# else
70	__asm
71	{
72	mov edx, [u32F1]
73	mov eax, [u32F2]
74	mul edx
75	mov dword ptr [u64], eax
76	mov dword ptr [u64 + 4], edx
77	}
78	# endif
79	return u64;
80	# else /* generic: */
81	return (uint64_t)u32F1 * u32F2;
82	# endif
83	}
84	#endif
85
86
87	/**
88	* Multiplies two signed 32-bit values returning a signed 64-bit result.
89	*
90	* @returns u32F1 * u32F2.
91	*/
92	#if RT_INLINE_ASM_EXTERNAL && !RT_INLINE_ASM_USES_INTRIN && defined(RT_ARCH_X86)
93	DECLASM(int64_t) ASMMult2xS32RetS64(int32_t i32F1, int32_t i32F2);
94	#else
95	DECLINLINE(int64_t) ASMMult2xS32RetS64(int32_t i32F1, int32_t i32F2)
96	{
97	# ifdef RT_ARCH_X86
98	int64_t i64;
99	# if RT_INLINE_ASM_GNU_STYLE
100	__asm__ __volatile__("imull %%edx"
101	: "=A" (i64)
102	: "a" (i32F2), "d" (i32F1));
103	# elif RT_INLINE_ASM_USES_INTRIN
104	i64 = __emul(i32F1, i32F2);
105	# else
106	__asm
107	{
108	mov edx, [i32F1]
109	mov eax, [i32F2]
110	imul edx
111	mov dword ptr [i64], eax
112	mov dword ptr [i64 + 4], edx
113	}
114	# endif
115	return i64;
116	# else /* generic: */
117	return (int64_t)i32F1 * i32F2;
118	# endif
119	}
120	#endif
121
122
#if ARCH_BITS == 64
/**
 * Multiplies two unsigned 64-bit values, producing the full 128-bit product
 * as two unsigned 64-bit halves.
 *
 * @returns The low 64 bits of u64F1 * u64F2.
 * @param   u64F1       The first factor.
 * @param   u64F2       The second factor.
 * @param   pu64ProdHi  Where to store the high 64 bits of the product.
 */
DECLINLINE(uint64_t) ASMMult2xU64Ret2xU64(uint64_t u64F1, uint64_t u64F2, uint64_t *pu64ProdHi)
{
# if defined(RT_ARCH_AMD64) && (RT_INLINE_ASM_GNU_STYLE || RT_INLINE_ASM_USES_INTRIN)
#  if RT_INLINE_ASM_GNU_STYLE
    uint64_t u64Low, u64High;
    /* MUL multiplies RAX by the operand and leaves the product in RDX:RAX. */
    __asm__ __volatile__("mulq %%rdx"
                         : "=a" (u64Low), "=d" (u64High)
                         : "0" (u64F1), "1" (u64F2));
    *pu64ProdHi = u64High;
    return u64Low;
#  elif RT_INLINE_ASM_USES_INTRIN
    return _umul128(u64F1, u64F2, pu64ProdHi);
#  else
#   error "hmm"
#  endif
# else /* generic: */
    /*
     * F1 * F2 = Prod
     * --   --
     * ab * cd = b*d + a*d*10 + b*c*10 + a*c*100
     *
     * Where a, b, c and d are 'digits', and 10 is max digit + 1.
     *
     * Our digits are 32-bit wide, so instead of 10 we multiply by 4G.
     * Prod = F1.s.Lo*F2.s.Lo + F1.s.Hi*F2.s.Lo*4G
     *      + F1.s.Lo*F2.s.Hi*4G + F1.s.Hi*F2.s.Hi*4G*4G
     */
    RTUINT128U  Prod;
    RTUINT64U   Tmp1;
    uint64_t    u64Tmp;
    RTUINT64U   F1, F2;
    F1.u = u64F1;
    F2.u = u64F2;

    /* Lo*Lo fills dw0 and dw1. */
    Prod.s.Lo = ASMMult2xU32RetU64(F1.s.Lo, F2.s.Lo);

    /* Hi*Lo is added in at dw1; its upper half plus the carry seeds dw2/dw3. */
    Tmp1.u = ASMMult2xU32RetU64(F1.s.Hi, F2.s.Lo);
    u64Tmp = (uint64_t)Prod.DWords.dw1 + Tmp1.s.Lo;
    Prod.DWords.dw1 = (uint32_t)u64Tmp;
    Prod.s.Hi  = Tmp1.s.Hi;
    Prod.s.Hi += u64Tmp >> 32; /* carry */

    /* Lo*Hi is also added in at dw1, rippling the carries up through dw2 and dw3. */
    Tmp1.u = ASMMult2xU32RetU64(F1.s.Lo, F2.s.Hi);
    u64Tmp = (uint64_t)Prod.DWords.dw1 + Tmp1.s.Lo;
    Prod.DWords.dw1 = (uint32_t)u64Tmp;
    u64Tmp >>= 32; /* carry */
    u64Tmp += Prod.DWords.dw2;
    u64Tmp += Tmp1.s.Hi;
    Prod.DWords.dw2 = (uint32_t)u64Tmp;
    Prod.DWords.dw3 += u64Tmp >> 32; /* carry */

    /* Hi*Hi goes straight into the upper 64 bits. */
    Prod.s.Hi += ASMMult2xU32RetU64(F1.s.Hi, F2.s.Hi);
    *pu64ProdHi = Prod.s.Hi;
    return Prod.s.Lo;
# endif
}
#endif
181
182
183
184	/**
185	* Divides a 64-bit unsigned by a 32-bit unsigned returning an unsigned 32-bit result.
186	*
187	* @returns u64 / u32.
188	*/
189	#if RT_INLINE_ASM_EXTERNAL && defined(RT_ARCH_X86)
190	DECLASM(uint32_t) ASMDivU64ByU32RetU32(uint64_t u64, uint32_t u32);
191	#else
192	DECLINLINE(uint32_t) ASMDivU64ByU32RetU32(uint64_t u64, uint32_t u32)
193	{
194	# ifdef RT_ARCH_X86
195	# if RT_INLINE_ASM_GNU_STYLE
196	RTCCUINTREG uDummy;
197	__asm__ __volatile__("divl %3"
198	: "=a" (u32), "=d"(uDummy)
199	: "A" (u64), "r" (u32));
200	# else
201	__asm
202	{
203	mov eax, dword ptr [u64]
204	mov edx, dword ptr [u64 + 4]
205	mov ecx, [u32]
206	div ecx
207	mov [u32], eax
208	}
209	# endif
210	return u32;
211	# else /* generic: */
212	return (uint32_t)(u64 / u32);
213	# endif
214	}
215	#endif
216
217
218	/**
219	* Divides a 64-bit signed by a 32-bit signed returning a signed 32-bit result.
220	*
221	* @returns u64 / u32.
222	*/
223	#if RT_INLINE_ASM_EXTERNAL && defined(RT_ARCH_X86)
224	DECLASM(int32_t) ASMDivS64ByS32RetS32(int64_t i64, int32_t i32);
225	#else
226	DECLINLINE(int32_t) ASMDivS64ByS32RetS32(int64_t i64, int32_t i32)
227	{
228	# ifdef RT_ARCH_X86
229	# if RT_INLINE_ASM_GNU_STYLE
230	RTCCUINTREG iDummy;
231	__asm__ __volatile__("idivl %3"
232	: "=a" (i32), "=d"(iDummy)
233	: "A" (i64), "r" (i32));
234	# else
235	__asm
236	{
237	mov eax, dword ptr [i64]
238	mov edx, dword ptr [i64 + 4]
239	mov ecx, [i32]
240	idiv ecx
241	mov [i32], eax
242	}
243	# endif
244	return i32;
245	# else /* generic: */
246	return (int32_t)(i64 / i32);
247	# endif
248	}
249	#endif
250
251
252	/**
253	* Performs 64-bit unsigned by a 32-bit unsigned division with a 32-bit unsigned result,
254	* returning the rest.
255	*
256	* @returns u64 % u32.
257	*
258	* @remarks It is important that the result is <= UINT32_MAX or we'll overflow and crash.
259	*/
260	#if RT_INLINE_ASM_EXTERNAL && defined(RT_ARCH_X86)
261	DECLASM(uint32_t) ASMModU64ByU32RetU32(uint64_t u64, uint32_t u32);
262	#else
263	DECLINLINE(uint32_t) ASMModU64ByU32RetU32(uint64_t u64, uint32_t u32)
264	{
265	# ifdef RT_ARCH_X86
266	# if RT_INLINE_ASM_GNU_STYLE
267	RTCCUINTREG uDummy;
268	__asm__ __volatile__("divl %3"
269	: "=a" (uDummy), "=d"(u32)
270	: "A" (u64), "r" (u32));
271	# else
272	__asm
273	{
274	mov eax, dword ptr [u64]
275	mov edx, dword ptr [u64 + 4]
276	mov ecx, [u32]
277	div ecx
278	mov [u32], edx
279	}
280	# endif
281	return u32;
282	# else /* generic: */
283	return (uint32_t)(u64 % u32);
284	# endif
285	}
286	#endif
287
288
289	/**
290	* Performs 64-bit signed by a 32-bit signed division with a 32-bit signed result,
291	* returning the rest.
292	*
293	* @returns u64 % u32.
294	*
295	* @remarks It is important that the result is <= UINT32_MAX or we'll overflow and crash.
296	*/
297	#if RT_INLINE_ASM_EXTERNAL && defined(RT_ARCH_X86)
298	DECLASM(int32_t) ASMModS64ByS32RetS32(int64_t i64, int32_t i32);
299	#else
300	DECLINLINE(int32_t) ASMModS64ByS32RetS32(int64_t i64, int32_t i32)
301	{
302	# ifdef RT_ARCH_X86
303	# if RT_INLINE_ASM_GNU_STYLE
304	RTCCUINTREG iDummy;
305	__asm__ __volatile__("idivl %3"
306	: "=a" (iDummy), "=d"(i32)
307	: "A" (i64), "r" (i32));
308	# else
309	__asm
310	{
311	mov eax, dword ptr [i64]
312	mov edx, dword ptr [i64 + 4]
313	mov ecx, [i32]
314	idiv ecx
315	mov [i32], edx
316	}
317	# endif
318	return i32;
319	# else /* generic: */
320	return (int32_t)(i64 % i32);
321	# endif
322	}
323	#endif
324
325
326	/**
327	* Multiple a 32-bit by a 32-bit integer and divide the result by a 32-bit integer
328	* using a 64 bit intermediate result.
329	*
330	* @returns (u32A * u32B) / u32C.
331	* @param u32A The 32-bit value (A).
332	* @param u32B The 32-bit value to multiple by A.
333	* @param u32C The 32-bit value to divide A*B by.
334	*
335	* @remarks Architecture specific.
336	* @remarks Make sure the result won't ever exceed 32-bit, because hardware
337	* exception may be raised if it does.
338	* @remarks On x86 this may be used to avoid dragging in 64-bit builtin
339	* arithmetics functions.
340	*/
341	#if RT_INLINE_ASM_EXTERNAL && (defined(RT_ARCH_AMD64) \|\| defined(RT_ARCH_X86))
342	DECLASM(uint32_t) ASMMultU32ByU32DivByU32(uint32_t u32A, uint32_t u32B, uint32_t u32C);
343	#else
344	DECLINLINE(uint32_t) ASMMultU32ByU32DivByU32(uint32_t u32A, uint32_t u32B, uint32_t u32C)
345	{
346	# if RT_INLINE_ASM_GNU_STYLE && (defined(RT_ARCH_AMD64) \|\| defined(RT_ARCH_X86))
347	uint32_t u32Result, u32Spill;
348	__asm__ __volatile__("mull %2\n\t"
349	"divl %3\n\t"
350	: "=&a" (u32Result),
351	"=&d" (u32Spill)
352	: "r" (u32B),
353	"r" (u32C),
354	"0" (u32A));
355	return u32Result;
356	# else
357	return (uint32_t)(((uint64_t)u32A * u32B) / u32C);
358	# endif
359	}
360	#endif
361
362
363	/**
364	* Multiple a 64-bit by a 32-bit integer and divide the result by a 32-bit integer
365	* using a 96 bit intermediate result.
366	*
367	* @returns (u64A * u32B) / u32C.
368	* @param u64A The 64-bit value.
369	* @param u32B The 32-bit value to multiple by A.
370	* @param u32C The 32-bit value to divide A*B by.
371	*
372	* @remarks Architecture specific.
373	* @remarks Make sure the result won't ever exceed 64-bit, because hardware
374	* exception may be raised if it does.
375	* @remarks On x86 this may be used to avoid dragging in 64-bit builtin
376	* arithmetics function.
377	*/
378	#if RT_INLINE_ASM_EXTERNAL \|\| !defined(__GNUC__) \|\| (!defined(RT_ARCH_AMD64) && !defined(RT_ARCH_X86))
379	DECLASM(uint64_t) ASMMultU64ByU32DivByU32(uint64_t u64A, uint32_t u32B, uint32_t u32C);
380	#else
381	DECLINLINE(uint64_t) ASMMultU64ByU32DivByU32(uint64_t u64A, uint32_t u32B, uint32_t u32C)
382	{
383	# if RT_INLINE_ASM_GNU_STYLE
384	# ifdef RT_ARCH_AMD64
385	uint64_t u64Result, u64Spill;
386	__asm__ __volatile__("mulq %2\n\t"
387	"divq %3\n\t"
388	: "=&a" (u64Result),
389	"=&d" (u64Spill)
390	: "r" ((uint64_t)u32B),
391	"r" ((uint64_t)u32C),
392	"0" (u64A));
393	return u64Result;
394	# else
395	uint32_t u32Dummy;
396	uint64_t u64Result;
397	__asm__ __volatile__("mull %%ecx \n\t" /* eax = u64Lo.lo = (u64A.lo * u32B).lo
398	edx = u64Lo.hi = (u64A.lo * u32B).hi */
399	"xchg %%eax,%%esi \n\t" /* esi = u64Lo.lo
400	eax = u64A.hi */
401	"xchg %%edx,%%edi \n\t" /* edi = u64Low.hi
402	edx = u32C */
403	"xchg %%edx,%%ecx \n\t" /* ecx = u32C
404	edx = u32B */
405	"mull %%edx \n\t" /* eax = u64Hi.lo = (u64A.hi * u32B).lo
406	edx = u64Hi.hi = (u64A.hi * u32B).hi */
407	"addl %%edi,%%eax \n\t" /* u64Hi.lo += u64Lo.hi */
408	"adcl $0,%%edx \n\t" /* u64Hi.hi += carry */
409	"divl %%ecx \n\t" /* eax = u64Hi / u32C
410	edx = u64Hi % u32C */
411	"movl %%eax,%%edi \n\t" /* edi = u64Result.hi = u64Hi / u32C */
412	"movl %%esi,%%eax \n\t" /* eax = u64Lo.lo */
413	"divl %%ecx \n\t" /* u64Result.lo */
414	"movl %%edi,%%edx \n\t" /* u64Result.hi */
415	: "=A"(u64Result), "=c"(u32Dummy),
416	"=S"(u32Dummy), "=D"(u32Dummy)
417	: "a"((uint32_t)u64A),
418	"S"((uint32_t)(u64A >> 32)),
419	"c"(u32B),
420	"D"(u32C));
421	return u64Result;
422	# endif
423	# else
424	RTUINT64U u;
425	uint64_t u64Lo = (uint64_t)(u64A & 0xffffffff) * u32B;
426	uint64_t u64Hi = (uint64_t)(u64A >> 32) * u32B;
427	u64Hi += (u64Lo >> 32);
428	u.s.Hi = (uint32_t)(u64Hi / u32C);
429	u.s.Lo = (uint32_t)((((u64Hi % u32C) << 32) + (u64Lo & 0xffffffff)) / u32C);
430	return u.u;
431	# endif
432	}
433	#endif
434
435	/** @} */
436	#endif /* !IPRT_INCLUDED_asm_math_h */
437

Note: See TracBrowser for help on using the repository browser.

source: vbox/trunk/include/iprt/asm-math.h@ 83776

Download in other formats: