VirtualBox

source: vbox/trunk/src/VBox/Runtime/r0drv/linux/memobj-r0drv-linux.c@93115

Last change on this file was revision 93115, checked in by vboxsync, 3 years ago

scm --update-copyright-year

1/* $Id: memobj-r0drv-linux.c 93115 2022-01-01 11:31:46Z vboxsync $ */
2/** @file
3 * IPRT - Ring-0 Memory Objects, Linux.
4 */
5
6/*
7 * Copyright (C) 2006-2022 Oracle Corporation
8 *
9 * This file is part of VirtualBox Open Source Edition (OSE), as
10 * available from http://www.virtualbox.org. This file is free software;
11 * you can redistribute it and/or modify it under the terms of the GNU
12 * General Public License (GPL) as published by the Free Software
13 * Foundation, in version 2 as it comes in the "COPYING" file of the
14 * VirtualBox OSE distribution. VirtualBox OSE is distributed in the
15 * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
16 *
17 * The contents of this file may alternatively be used under the terms
18 * of the Common Development and Distribution License Version 1.0
19 * (CDDL) only, as it comes in the "COPYING.CDDL" file of the
20 * VirtualBox OSE distribution, in which case the provisions of the
21 * CDDL are applicable instead of those of the GPL.
22 *
23 * You may elect to license modified versions of this file under the
24 * terms and conditions of either the GPL or the CDDL or both.
25 */
26
27
28/*********************************************************************************************************************************
29* Header Files *
30*********************************************************************************************************************************/
31#include "the-linux-kernel.h"
32
33#include <iprt/memobj.h>
34#include <iprt/assert.h>
35#include <iprt/err.h>
36#include <iprt/log.h>
37#include <iprt/mem.h>
38#include <iprt/process.h>
39#include <iprt/string.h>
40#include "internal/memobj.h"
41#include "internal/iprt.h"
42
43
44/*********************************************************************************************************************************
45* Defined Constants And Macros *
46*********************************************************************************************************************************/
47/* early 2.6 kernels */
48#ifndef PAGE_SHARED_EXEC
49# define PAGE_SHARED_EXEC PAGE_SHARED
50#endif
51#ifndef PAGE_READONLY_EXEC
52# define PAGE_READONLY_EXEC PAGE_READONLY
53#endif
54
55/** @def IPRT_USE_ALLOC_VM_AREA_FOR_EXEC
56 * Whether we use alloc_vm_area (3.2+) for executable memory.
57 * This is a must for 5.8+, but we enable it all the way back to 3.2.x for
58 * better W^X compliance (fExecutable flag). */
59#if RTLNX_VER_RANGE(3,2,0, 5,10,0) || defined(DOXYGEN_RUNNING)
60# define IPRT_USE_ALLOC_VM_AREA_FOR_EXEC
61#endif
62/** @def IPRT_USE_APPLY_TO_PAGE_RANGE_FOR_EXEC
63 * alloc_vm_area was removed with 5.10 so we have to resort to a different way
64 * to allocate executable memory.
65 * It would be possible to remove IPRT_USE_ALLOC_VM_AREA_FOR_EXEC and use
66 * this path exclusively for 3.2+, but there has been no time to test that it
67 * really works on every supported kernel, so better play safe for now.
68 */
69#if RTLNX_VER_MIN(5,10,0) || defined(DOXYGEN_RUNNING)
70# define IPRT_USE_APPLY_TO_PAGE_RANGE_FOR_EXEC
71#endif
72
73/*
74 * 2.6.29+ kernels don't work with remap_pfn_range() anymore because
75 * track_pfn_vma_new() is apparently not defined for non-RAM pages.
76 * It should be safe to use vm_insert_page() on older kernels as well.
77 */
78#if RTLNX_VER_MIN(2,6,23)
79# define VBOX_USE_INSERT_PAGE
80#endif
81#if defined(CONFIG_X86_PAE) \
82 && ( defined(HAVE_26_STYLE_REMAP_PAGE_RANGE) \
83 || RTLNX_VER_RANGE(2,6,0, 2,6,11) )
84# define VBOX_USE_PAE_HACK
85#endif
86
87/* gfp_t was introduced in 2.6.14, define it for earlier. */
88#if RTLNX_VER_MAX(2,6,14)
89# define gfp_t unsigned
90#endif
91
92/*
93 * Wrappers around mmap_lock/mmap_sem difference.
94 */
95#if RTLNX_VER_MIN(5,8,0)
96# define LNX_MM_DOWN_READ(a_pMm) down_read(&(a_pMm)->mmap_lock)
97# define LNX_MM_UP_READ(a_pMm) up_read(&(a_pMm)->mmap_lock)
98# define LNX_MM_DOWN_WRITE(a_pMm) down_write(&(a_pMm)->mmap_lock)
99# define LNX_MM_UP_WRITE(a_pMm) up_write(&(a_pMm)->mmap_lock)
100#else
101# define LNX_MM_DOWN_READ(a_pMm) down_read(&(a_pMm)->mmap_sem)
102# define LNX_MM_UP_READ(a_pMm) up_read(&(a_pMm)->mmap_sem)
103# define LNX_MM_DOWN_WRITE(a_pMm) down_write(&(a_pMm)->mmap_sem)
104# define LNX_MM_UP_WRITE(a_pMm) up_write(&(a_pMm)->mmap_sem)
105#endif
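/* Illustrative usage sketch for the wrappers above (assumes a valid pTask->mm
 * and a context where sleeping is allowed); the mm member was renamed from
 * mmap_sem to mmap_lock in Linux 5.8, which is all these macros paper over:
 *
 *     LNX_MM_DOWN_READ(pTask->mm);
 *     ... walk VMAs or pin pages belonging to pTask->mm ...
 *     LNX_MM_UP_READ(pTask->mm);
 *
 * The same call sites then compile against both pre- and post-5.8 kernels. */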
106
107
108/*********************************************************************************************************************************
109* Structures and Typedefs *
110*********************************************************************************************************************************/
111/**
112 * The Linux version of the memory object structure.
113 */
114typedef struct RTR0MEMOBJLNX
115{
116 /** The core structure. */
117 RTR0MEMOBJINTERNAL Core;
118 /** Set if the allocation is contiguous.
119 * This means it has to be given back as one chunk. */
120 bool fContiguous;
121 /** Set if executable allocation. */
122 bool fExecutable;
123 /** Set if we've vmap'ed the memory into ring-0. */
124 bool fMappedToRing0;
125 /** This is non-zero if large page allocation. */
126 uint8_t cLargePageOrder;
127#ifdef IPRT_USE_ALLOC_VM_AREA_FOR_EXEC
128 /** Return from alloc_vm_area() that we now need to use for executable
129 * memory. */
130 struct vm_struct *pArea;
131 /** PTE array that goes along with pArea (must be freed). */
132 pte_t **papPtesForArea;
133#endif
134 /** The pages in the apPages array. */
135 size_t cPages;
136 /** Array of struct page pointers. (variable size) */
137 struct page *apPages[1];
138} RTR0MEMOBJLNX;
139/** Pointer to the linux memory object. */
140typedef RTR0MEMOBJLNX *PRTR0MEMOBJLNX;
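/* A minimal sketch of how the trailing apPages[1] member is used: the object
 * is over-allocated so that apPages[] can hold cPages entries, as done further
 * down in this file:
 *
 *     pMemLnx = (PRTR0MEMOBJLNX)rtR0MemObjNew(RT_UOFFSETOF_DYN(RTR0MEMOBJLNX, apPages[cPages]),
 *                                             enmType, NULL, cb, pszTag);
 *
 * making apPages[0] through apPages[cPages - 1] valid entries. */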
141
142
143static void rtR0MemObjLinuxFreePages(PRTR0MEMOBJLNX pMemLnx);
144
145
146/**
147 * Helper that converts from a RTR0PROCESS handle to a linux task.
148 *
149 * @returns The corresponding Linux task.
150 * @param R0Process IPRT ring-0 process handle.
151 */
152static struct task_struct *rtR0ProcessToLinuxTask(RTR0PROCESS R0Process)
153{
154 /** @todo fix rtR0ProcessToLinuxTask!! */
155 /** @todo many (all?) callers currently assume that we return 'current'! */
156 return R0Process == RTR0ProcHandleSelf() ? current : NULL;
157}
158
159
160/**
161 * Compute order. Some functions allocate 2^order pages.
162 *
163 * @returns order.
164 * @param cPages Number of pages.
165 */
166static int rtR0MemObjLinuxOrder(size_t cPages)
167{
168 int iOrder;
169 size_t cTmp;
170
171 for (iOrder = 0, cTmp = cPages; cTmp >>= 1; ++iOrder)
172 ;
173 if (cPages & ~((size_t)1 << iOrder))
174 ++iOrder;
175
176 return iOrder;
177}
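/* Worked examples for the order computation above (illustrative only): the
 * result is the smallest iOrder such that 2^iOrder >= cPages, so
 *     rtR0MemObjLinuxOrder(1) == 0,  rtR0MemObjLinuxOrder(2) == 1,
 *     rtR0MemObjLinuxOrder(3) == 2,  rtR0MemObjLinuxOrder(4) == 2,
 *     rtR0MemObjLinuxOrder(5) == 3,  rtR0MemObjLinuxOrder(512) == 9,
 * i.e. non-power-of-two page counts are rounded up to the next power of two. */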
178
179
180/**
181 * Converts from RTMEM_PROT_* to Linux PAGE_*.
182 *
183 * @returns Linux page protection constant.
184 * @param fProt The IPRT protection mask.
185 * @param fKernel Whether it applies to kernel or user space.
186 */
187static pgprot_t rtR0MemObjLinuxConvertProt(unsigned fProt, bool fKernel)
188{
189 switch (fProt)
190 {
191 default:
192 AssertMsgFailed(("%#x %d\n", fProt, fKernel)); RT_FALL_THRU();
193 case RTMEM_PROT_NONE:
194 return PAGE_NONE;
195
196 case RTMEM_PROT_READ:
197 return fKernel ? PAGE_KERNEL_RO : PAGE_READONLY;
198
199 case RTMEM_PROT_WRITE:
200 case RTMEM_PROT_WRITE | RTMEM_PROT_READ:
201 return fKernel ? PAGE_KERNEL : PAGE_SHARED;
202
203 case RTMEM_PROT_EXEC:
204 case RTMEM_PROT_EXEC | RTMEM_PROT_READ:
205#if defined(RT_ARCH_X86) || defined(RT_ARCH_AMD64)
206 if (fKernel)
207 {
208 pgprot_t fPg = MY_PAGE_KERNEL_EXEC;
209 pgprot_val(fPg) &= ~_PAGE_RW;
210 return fPg;
211 }
212 return PAGE_READONLY_EXEC;
213#else
214 return fKernel ? MY_PAGE_KERNEL_EXEC : PAGE_READONLY_EXEC;
215#endif
216
217 case RTMEM_PROT_WRITE | RTMEM_PROT_EXEC:
218 case RTMEM_PROT_WRITE | RTMEM_PROT_EXEC | RTMEM_PROT_READ:
219 return fKernel ? MY_PAGE_KERNEL_EXEC : PAGE_SHARED_EXEC;
220 }
221}
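/* A few example conversions (illustrative only, derived from the switch above):
 *     rtR0MemObjLinuxConvertProt(RTMEM_PROT_READ,                    false) -> PAGE_READONLY
 *     rtR0MemObjLinuxConvertProt(RTMEM_PROT_READ | RTMEM_PROT_WRITE, true)  -> PAGE_KERNEL
 *     rtR0MemObjLinuxConvertProt(RTMEM_PROT_READ | RTMEM_PROT_EXEC,  true)  -> MY_PAGE_KERNEL_EXEC,
 *         with _PAGE_RW cleared on x86/amd64 so the mapping is executable but not writable. */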
222
223
224/**
225 * Worker for rtR0MemObjNativeReserveUser and rtR0MemObjNativeMapUser that creates
226 * an empty user space mapping.
227 *
228 * We acquire the mmap_sem/mmap_lock of the task!
229 *
230 * @returns Pointer to the mapping.
231 * (void *)-1 on failure.
232 * @param R3PtrFixed (RTR3PTR)-1 if anywhere, otherwise a specific location.
233 * @param cb The size of the mapping.
234 * @param uAlignment The alignment of the mapping.
235 * @param pTask The Linux task to create this mapping in.
236 * @param fProt The RTMEM_PROT_* mask.
237 */
238static void *rtR0MemObjLinuxDoMmap(RTR3PTR R3PtrFixed, size_t cb, size_t uAlignment, struct task_struct *pTask, unsigned fProt)
239{
240 unsigned fLnxProt;
241 unsigned long ulAddr;
242
243 Assert(pTask == current); /* do_mmap */
244 RT_NOREF_PV(pTask);
245
246 /*
247 * Convert from IPRT protection to mman.h PROT_ and call do_mmap.
248 */
249 fProt &= (RTMEM_PROT_NONE | RTMEM_PROT_READ | RTMEM_PROT_WRITE | RTMEM_PROT_EXEC);
250 if (fProt == RTMEM_PROT_NONE)
251 fLnxProt = PROT_NONE;
252 else
253 {
254 fLnxProt = 0;
255 if (fProt & RTMEM_PROT_READ)
256 fLnxProt |= PROT_READ;
257 if (fProt & RTMEM_PROT_WRITE)
258 fLnxProt |= PROT_WRITE;
259 if (fProt & RTMEM_PROT_EXEC)
260 fLnxProt |= PROT_EXEC;
261 }
262
263 if (R3PtrFixed != (RTR3PTR)-1)
264 {
265#if RTLNX_VER_MIN(3,5,0)
266 ulAddr = vm_mmap(NULL, R3PtrFixed, cb, fLnxProt, MAP_SHARED | MAP_ANONYMOUS | MAP_FIXED, 0);
267#else
268 LNX_MM_DOWN_WRITE(pTask->mm);
269 ulAddr = do_mmap(NULL, R3PtrFixed, cb, fLnxProt, MAP_SHARED | MAP_ANONYMOUS | MAP_FIXED, 0);
270 LNX_MM_UP_WRITE(pTask->mm);
271#endif
272 }
273 else
274 {
275#if RTLNX_VER_MIN(3,5,0)
276 ulAddr = vm_mmap(NULL, 0, cb, fLnxProt, MAP_SHARED | MAP_ANONYMOUS, 0);
277#else
278 LNX_MM_DOWN_WRITE(pTask->mm);
279 ulAddr = do_mmap(NULL, 0, cb, fLnxProt, MAP_SHARED | MAP_ANONYMOUS, 0);
280 LNX_MM_UP_WRITE(pTask->mm);
281#endif
282 if ( !(ulAddr & ~PAGE_MASK)
283 && (ulAddr & (uAlignment - 1)))
284 {
285 /** @todo implement uAlignment properly... We'll probably need to make some dummy mappings to fill
286 * up alignment gaps. This is of course complicated by fragmentation (which we might have caused
287 * ourselves) and further by there being two mmap strategies (top / bottom). */
288 /* For now, just ignore uAlignment requirements... */
289 }
290 }
291
292
293 if (ulAddr & ~PAGE_MASK) /* ~PAGE_MASK == PAGE_OFFSET_MASK */
294 return (void *)-1;
295 return (void *)ulAddr;
296}
297
298
299/**
300 * Worker that destroys a user space mapping.
301 * Undoes what rtR0MemObjLinuxDoMmap did.
302 *
303 * We acquire the mmap_sem/mmap_lock of the task!
304 *
305 * @param pv The ring-3 mapping.
306 * @param cb The size of the mapping.
307 * @param pTask The Linux task to destroy this mapping in.
308 */
309static void rtR0MemObjLinuxDoMunmap(void *pv, size_t cb, struct task_struct *pTask)
310{
311#if RTLNX_VER_MIN(3,5,0)
312 Assert(pTask == current); RT_NOREF_PV(pTask);
313 vm_munmap((unsigned long)pv, cb);
314#elif defined(USE_RHEL4_MUNMAP)
315 LNX_MM_DOWN_WRITE(pTask->mm);
316 do_munmap(pTask->mm, (unsigned long)pv, cb, 0); /* should it be 1 or 0? */
317 LNX_MM_UP_WRITE(pTask->mm);
318#else
319 LNX_MM_DOWN_WRITE(pTask->mm);
320 do_munmap(pTask->mm, (unsigned long)pv, cb);
321 LNX_MM_UP_WRITE(pTask->mm);
322#endif
323}
324
325
326/**
327 * Internal worker that allocates physical pages and creates the memory object for them.
328 *
329 * @returns IPRT status code.
330 * @param ppMemLnx Where to store the memory object pointer.
331 * @param enmType The object type.
332 * @param cb The number of bytes to allocate.
333 * @param uAlignment The alignment of the physical memory.
334 * Only valid if fContiguous == true, ignored otherwise.
335 * @param fFlagsLnx The page allocation flags (GFPs).
336 * @param fContiguous Whether the allocation must be contiguous.
337 * @param fExecutable Whether the memory must be executable.
338 * @param rcNoMem What to return when we're out of pages.
339 * @param pszTag Allocation tag used for statistics and such.
340 */
341static int rtR0MemObjLinuxAllocPages(PRTR0MEMOBJLNX *ppMemLnx, RTR0MEMOBJTYPE enmType, size_t cb,
342 size_t uAlignment, gfp_t fFlagsLnx, bool fContiguous, bool fExecutable, int rcNoMem,
343 const char *pszTag)
344{
345 size_t iPage;
346 size_t const cPages = cb >> PAGE_SHIFT;
347 struct page *paPages;
348
349 /*
350 * Allocate a memory object structure that's large enough to contain
351 * the page pointer array.
352 */
353 PRTR0MEMOBJLNX pMemLnx = (PRTR0MEMOBJLNX)rtR0MemObjNew(RT_UOFFSETOF_DYN(RTR0MEMOBJLNX, apPages[cPages]), enmType,
354 NULL, cb, pszTag);
355 if (!pMemLnx)
356 return VERR_NO_MEMORY;
357 pMemLnx->Core.fFlags |= RTR0MEMOBJ_FLAGS_UNINITIALIZED_AT_ALLOC;
358 pMemLnx->cPages = cPages;
359
360 if (cPages > 255)
361 {
362# ifdef __GFP_REPEAT
363 /* Try hard to allocate the memory, but the allocation attempt might fail. */
364 fFlagsLnx |= __GFP_REPEAT;
365# endif
366# ifdef __GFP_NOMEMALLOC
367 /* Introduced with Linux 2.6.12: Don't use emergency reserves */
368 fFlagsLnx |= __GFP_NOMEMALLOC;
369# endif
370 }
371
372 /*
373 * Allocate the pages.
374 * For small allocations we'll try contiguous first and then fall back on page by page.
375 */
376#if RTLNX_VER_MIN(2,4,22)
377 if ( fContiguous
378 || cb <= PAGE_SIZE * 2)
379 {
380# ifdef VBOX_USE_INSERT_PAGE
381 paPages = alloc_pages(fFlagsLnx | __GFP_COMP | __GFP_NOWARN, rtR0MemObjLinuxOrder(cPages));
382# else
383 paPages = alloc_pages(fFlagsLnx | __GFP_NOWARN, rtR0MemObjLinuxOrder(cPages));
384# endif
385 if (paPages)
386 {
387 fContiguous = true;
388 for (iPage = 0; iPage < cPages; iPage++)
389 pMemLnx->apPages[iPage] = &paPages[iPage];
390 }
391 else if (fContiguous)
392 {
393 rtR0MemObjDelete(&pMemLnx->Core);
394 return rcNoMem;
395 }
396 }
397
398 if (!fContiguous)
399 {
400 /** @todo Try to use alloc_pages_bulk_array when available; it should be faster
401 * than an alloc_page loop. Put it in #ifdefs similar to
402 * IPRT_USE_APPLY_TO_PAGE_RANGE_FOR_EXEC. */
403 for (iPage = 0; iPage < cPages; iPage++)
404 {
405 pMemLnx->apPages[iPage] = alloc_page(fFlagsLnx | __GFP_NOWARN);
406 if (RT_UNLIKELY(!pMemLnx->apPages[iPage]))
407 {
408 while (iPage-- > 0)
409 __free_page(pMemLnx->apPages[iPage]);
410 rtR0MemObjDelete(&pMemLnx->Core);
411 return rcNoMem;
412 }
413 }
414 }
415
416#else /* < 2.4.22 */
417 /** @todo figure out why we didn't allocate page-by-page on 2.4.21 and older... */
418 paPages = alloc_pages(fFlagsLnx, rtR0MemObjLinuxOrder(cPages));
419 if (!paPages)
420 {
421 rtR0MemObjDelete(&pMemLnx->Core);
422 return rcNoMem;
423 }
424 for (iPage = 0; iPage < cPages; iPage++)
425 {
426 pMemLnx->apPages[iPage] = &paPages[iPage];
427 if (fExecutable)
428 MY_SET_PAGES_EXEC(pMemLnx->apPages[iPage], 1);
429 if (PageHighMem(pMemLnx->apPages[iPage]))
430 BUG();
431 }
432
433 fContiguous = true;
434#endif /* < 2.4.22 */
435 pMemLnx->fContiguous = fContiguous;
436 pMemLnx->fExecutable = fExecutable;
437
438#if RTLNX_VER_MAX(4,5,0)
439 /*
440 * Reserve the pages.
441 *
442 * Linux >= 4.5 with CONFIG_DEBUG_VM panics when setting PG_reserved on compound
443 * pages. According to Michal Hocko this shouldn't be necessary anyway because
444 * as pages which are not on the LRU list are never evictable.
445 */
446 for (iPage = 0; iPage < cPages; iPage++)
447 SetPageReserved(pMemLnx->apPages[iPage]);
448#endif
449
450 /*
451 * Note that the physical address of memory allocated with alloc_pages(flags, order)
452 * is always 2^(PAGE_SHIFT+order)-aligned.
453 */
454 if ( fContiguous
455 && uAlignment > PAGE_SIZE)
456 {
457 /*
458 * Check for alignment constraints. The physical address of memory allocated with
459 * alloc_pages(flags, order) is always 2^(PAGE_SHIFT+order)-aligned.
460 */
461 if (RT_UNLIKELY(page_to_phys(pMemLnx->apPages[0]) & (uAlignment - 1)))
462 {
463 /*
464 * This should never happen!
465 */
466 printk("rtR0MemObjLinuxAllocPages(cb=0x%lx, uAlignment=0x%lx): alloc_pages(..., %d) returned physical memory at 0x%lx!\n",
467 (unsigned long)cb, (unsigned long)uAlignment, rtR0MemObjLinuxOrder(cPages), (unsigned long)page_to_phys(pMemLnx->apPages[0]));
468 rtR0MemObjLinuxFreePages(pMemLnx);
469 return rcNoMem;
470 }
471 }
472
473 *ppMemLnx = pMemLnx;
474 return VINF_SUCCESS;
475}
476
477
478/**
479 * Frees the physical pages allocated by the rtR0MemObjLinuxAllocPages() call.
480 *
481 * This method does NOT free the object.
482 *
483 * @param pMemLnx The object whose physical pages should be freed.
484 */
485static void rtR0MemObjLinuxFreePages(PRTR0MEMOBJLNX pMemLnx)
486{
487 size_t iPage = pMemLnx->cPages;
488 if (iPage > 0)
489 {
490 /*
491 * Restore the page flags.
492 */
493 while (iPage-- > 0)
494 {
495#if RTLNX_VER_MAX(4,5,0)
496 /* See SetPageReserved() in rtR0MemObjLinuxAllocPages() */
497 ClearPageReserved(pMemLnx->apPages[iPage]);
498#endif
499#if RTLNX_VER_MAX(2,4,22)
500 if (pMemLnx->fExecutable)
501 MY_SET_PAGES_NOEXEC(pMemLnx->apPages[iPage], 1);
502#endif
503 }
504
505 /*
506 * Free the pages.
507 */
508#if RTLNX_VER_MIN(2,4,22)
509 if (!pMemLnx->fContiguous)
510 {
511 iPage = pMemLnx->cPages;
512 while (iPage-- > 0)
513 __free_page(pMemLnx->apPages[iPage]);
514 }
515 else
516#endif
517 __free_pages(pMemLnx->apPages[0], rtR0MemObjLinuxOrder(pMemLnx->cPages));
518
519 pMemLnx->cPages = 0;
520 }
521}
522
523
524#ifdef IPRT_USE_APPLY_TO_PAGE_RANGE_FOR_EXEC
525/**
526 * User data passed to the apply_to_page_range() callback.
527 */
528typedef struct LNXAPPLYPGRANGE
529{
530 /** Pointer to the memory object. */
531 PRTR0MEMOBJLNX pMemLnx;
532 /** The page protection flags to apply. */
533 pgprot_t fPg;
534} LNXAPPLYPGRANGE;
535/** Pointer to the user data. */
536typedef LNXAPPLYPGRANGE *PLNXAPPLYPGRANGE;
537/** Pointer to the const user data. */
538typedef const LNXAPPLYPGRANGE *PCLNXAPPLYPGRANGE;
539
540/**
541 * Callback called in apply_to_page_range().
542 *
543 * @returns Linux status code.
544 * @param pPte Pointer to the page table entry for the given address.
545 * @param uAddr The address to apply the new protection to.
546 * @param pvUser The opaque user data.
547 */
548static int rtR0MemObjLinuxApplyPageRange(pte_t *pPte, unsigned long uAddr, void *pvUser)
549{
550 PCLNXAPPLYPGRANGE pArgs = (PCLNXAPPLYPGRANGE)pvUser;
551 PRTR0MEMOBJLNX pMemLnx = pArgs->pMemLnx;
552 size_t idxPg = (uAddr - (unsigned long)pMemLnx->Core.pv) >> PAGE_SHIFT;
553
554 set_pte(pPte, mk_pte(pMemLnx->apPages[idxPg], pArgs->fPg));
555 return 0;
556}
557#endif
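/* A hedged sketch of how the callback above is typically driven; the actual
 * call site lives further down in this file, outside this excerpt.  It uses
 * the stock Linux helper apply_to_page_range(struct mm_struct *, unsigned long,
 * unsigned long, pte_fn_t, void *):
 *
 *     LNXAPPLYPGRANGE Args;
 *     Args.pMemLnx = pMemLnx;
 *     Args.fPg     = fPg;
 *     rc = apply_to_page_range(&init_mm, (unsigned long)pMemLnx->Core.pv,
 *                              pMemLnx->Core.cb, rtR0MemObjLinuxApplyPageRange, &Args);
 */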
558
559
560/**
561 * Maps the allocation into ring-0.
562 *
563 * This will update the RTR0MEMOBJLNX::Core.pv and RTR0MEMOBJ::fMappedToRing0 members.
564 *
565 * Contiguous mappings that aren't in 'high' memory will already be mapped into kernel
566 * space, so we'll use that mapping if possible. If execute access is required, we'll
567 * play safe and do our own mapping.
568 *
569 * @returns IPRT status code.
570 * @param pMemLnx The linux memory object to map.
571 * @param fExecutable Whether execute access is required.
572 */
573static int rtR0MemObjLinuxVMap(PRTR0MEMOBJLNX pMemLnx, bool fExecutable)
574{
575 int rc = VINF_SUCCESS;
576
577 /*
578 * Choose mapping strategy.
579 */
580 bool fMustMap = fExecutable
581 || !pMemLnx->fContiguous;
582 if (!fMustMap)
583 {
584 size_t iPage = pMemLnx->cPages;
585 while (iPage-- > 0)
586 if (PageHighMem(pMemLnx->apPages[iPage]))
587 {
588 fMustMap = true;
589 break;
590 }
591 }
592
593 Assert(!pMemLnx->Core.pv);
594 Assert(!pMemLnx->fMappedToRing0);
595
596 if (fMustMap)
597 {
598 /*
599 * Use vmap - 2.4.22 and later.
600 */
601#if RTLNX_VER_MIN(2,4,22)
602 pgprot_t fPg;
603 pgprot_val(fPg) = _PAGE_PRESENT | _PAGE_RW;
604# ifdef _PAGE_NX
605 if (!fExecutable)
606 pgprot_val(fPg) |= _PAGE_NX;
607# endif
608
609# ifdef IPRT_USE_ALLOC_VM_AREA_FOR_EXEC
610 if (fExecutable)
611 {
612# if RTLNX_VER_MIN(3,2,51)
613 pte_t **papPtes = (pte_t **)kmalloc_array(pMemLnx->cPages, sizeof(papPtes[0]), GFP_KERNEL);
614# else
615 pte_t **papPtes = (pte_t **)kmalloc(pMemLnx->cPages * sizeof(papPtes[0]), GFP_KERNEL);
616# endif
617 if (papPtes)
618 {
619 pMemLnx->pArea = alloc_vm_area(pMemLnx->Core.cb, papPtes); /* Note! pArea->nr_pages is not set. */
620 if (pMemLnx->pArea)
621 {
622 size_t i;
623 Assert(pMemLnx->pArea->size >= pMemLnx->Core.cb); /* Note! includes guard page. */
624 Assert(pMemLnx->pArea->addr);
625# ifdef _PAGE_NX
626 pgprot_val(fPg) |= _PAGE_NX; /* Uses RTR0MemObjProtect to clear NX when memory ready, W^X fashion. */
627# endif
628 pMemLnx->papPtesForArea = papPtes;
629 for (i = 0; i < pMemLnx->cPages; i++)
630 *papPtes[i] = mk_pte(pMemLnx->apPages[i], fPg);
631 pMemLnx->Core.pv = pMemLnx->pArea->addr;
632 pMemLnx->fMappedToRing0 = true;
633 }
634 else
635 {
636 kfree(papPtes);
637 rc = VERR_MAP_FAILED;
638 }
639 }
640 else
641 rc = VERR_MAP_FAILED;
642 }
643 else
644# endif
645 {
646# if defined(IPRT_USE_APPLY_TO_PAGE_RANGE_FOR_EXEC)
647 if (fExecutable)
648 pgprot_val(fPg) |= _PAGE_NX; /* Uses RTR0MemObjProtect to clear NX when memory ready, W^X fashion. */
649# endif
650
651# ifdef VM_MAP
652 pMemLnx->Core.pv = vmap(&pMemLnx->apPages[0], pMemLnx->cPages, VM_MAP, fPg);
653# else
654 pMemLnx->Core.pv = vmap(&pMemLnx->apPages[0], pMemLnx->cPages, VM_ALLOC, fPg);
655# endif
656 if (pMemLnx->Core.pv)
657 pMemLnx->fMappedToRing0 = true;
658 else
659 rc = VERR_MAP_FAILED;
660 }
661#else /* < 2.4.22 */
662 rc = VERR_NOT_SUPPORTED;
663#endif
664 }
665 else
666 {
667 /*
668 * Use the kernel RAM mapping.
669 */
670 pMemLnx->Core.pv = phys_to_virt(page_to_phys(pMemLnx->apPages[0]));
671 Assert(pMemLnx->Core.pv);
672 }
673
674 return rc;
675}
676
677
678/**
679 * Undoes what rtR0MemObjLinuxVMap() did.
680 *
681 * @param pMemLnx The linux memory object.
682 */
683static void rtR0MemObjLinuxVUnmap(PRTR0MEMOBJLNX pMemLnx)
684{
685#if RTLNX_VER_MIN(2,4,22)
686# ifdef IPRT_USE_ALLOC_VM_AREA_FOR_EXEC
687 if (pMemLnx->pArea)
688 {
689# if 0
690 pte_t **papPtes = pMemLnx->papPtesForArea;
691 size_t i;
692 for (i = 0; i < pMemLnx->cPages; i++)
693 *papPtes[i] = 0;
694# endif
695 free_vm_area(pMemLnx->pArea);
696 kfree(pMemLnx->papPtesForArea);
697 pMemLnx->pArea = NULL;
698 pMemLnx->papPtesForArea = NULL;
699 }
700 else
701# endif
702 if (pMemLnx->fMappedToRing0)
703 {
704 Assert(pMemLnx->Core.pv);
705 vunmap(pMemLnx->Core.pv);
706 pMemLnx->fMappedToRing0 = false;
707 }
708#else /* < 2.4.22 */
709 Assert(!pMemLnx->fMappedToRing0);
710#endif
711 pMemLnx->Core.pv = NULL;
712}
713
714
715DECLHIDDEN(int) rtR0MemObjNativeFree(RTR0MEMOBJ pMem)
716{
717 IPRT_LINUX_SAVE_EFL_AC();
718 PRTR0MEMOBJLNX pMemLnx = (PRTR0MEMOBJLNX)pMem;
719
720 /*
721 * Release any memory that we've allocated or locked.
722 */
723 switch (pMemLnx->Core.enmType)
724 {
725 case RTR0MEMOBJTYPE_PAGE:
726 case RTR0MEMOBJTYPE_LOW:
727 case RTR0MEMOBJTYPE_CONT:
728 case RTR0MEMOBJTYPE_PHYS:
729 case RTR0MEMOBJTYPE_PHYS_NC:
730 rtR0MemObjLinuxVUnmap(pMemLnx);
731 rtR0MemObjLinuxFreePages(pMemLnx);
732 break;
733
734 case RTR0MEMOBJTYPE_LARGE_PAGE:
735 {
736 uint32_t const cLargePages = pMemLnx->Core.cb >> (pMemLnx->cLargePageOrder + PAGE_SHIFT);
737 uint32_t iLargePage;
738 for (iLargePage = 0; iLargePage < cLargePages; iLargePage++)
739 __free_pages(pMemLnx->apPages[iLargePage << pMemLnx->cLargePageOrder], pMemLnx->cLargePageOrder);
740 pMemLnx->cPages = 0;
741
742#ifdef IPRT_USE_ALLOC_VM_AREA_FOR_EXEC
743 Assert(!pMemLnx->pArea);
744 Assert(!pMemLnx->papPtesForArea);
745#endif
746 break;
747 }
748
749 case RTR0MEMOBJTYPE_LOCK:
750 if (pMemLnx->Core.u.Lock.R0Process != NIL_RTR0PROCESS)
751 {
752 struct task_struct *pTask = rtR0ProcessToLinuxTask(pMemLnx->Core.u.Lock.R0Process);
753 size_t iPage;
754 Assert(pTask);
755 if (pTask && pTask->mm)
756 LNX_MM_DOWN_READ(pTask->mm);
757
758 iPage = pMemLnx->cPages;
759 while (iPage-- > 0)
760 {
761 if (!PageReserved(pMemLnx->apPages[iPage]))
762 SetPageDirty(pMemLnx->apPages[iPage]);
763#if RTLNX_VER_MIN(4,6,0)
764 put_page(pMemLnx->apPages[iPage]);
765#else
766 page_cache_release(pMemLnx->apPages[iPage]);
767#endif
768 }
769
770 if (pTask && pTask->mm)
771 LNX_MM_UP_READ(pTask->mm);
772 }
773 /* else: kernel memory - nothing to do here. */
774 break;
775
776 case RTR0MEMOBJTYPE_RES_VIRT:
777 Assert(pMemLnx->Core.pv);
778 if (pMemLnx->Core.u.ResVirt.R0Process != NIL_RTR0PROCESS)
779 {
780 struct task_struct *pTask = rtR0ProcessToLinuxTask(pMemLnx->Core.u.Lock.R0Process);
781 Assert(pTask);
782 if (pTask && pTask->mm)
783 rtR0MemObjLinuxDoMunmap(pMemLnx->Core.pv, pMemLnx->Core.cb, pTask);
784 }
785 else
786 {
787 vunmap(pMemLnx->Core.pv);
788
789 Assert(pMemLnx->cPages == 1 && pMemLnx->apPages[0] != NULL);
790 __free_page(pMemLnx->apPages[0]);
791 pMemLnx->apPages[0] = NULL;
792 pMemLnx->cPages = 0;
793 }
794 pMemLnx->Core.pv = NULL;
795 break;
796
797 case RTR0MEMOBJTYPE_MAPPING:
798 Assert(pMemLnx->cPages == 0); Assert(pMemLnx->Core.pv);
799 if (pMemLnx->Core.u.ResVirt.R0Process != NIL_RTR0PROCESS)
800 {
801 struct task_struct *pTask = rtR0ProcessToLinuxTask(pMemLnx->Core.u.Lock.R0Process);
802 Assert(pTask);
803 if (pTask && pTask->mm)
804 rtR0MemObjLinuxDoMunmap(pMemLnx->Core.pv, pMemLnx->Core.cb, pTask);
805 }
806 else
807 vunmap(pMemLnx->Core.pv);
808 pMemLnx->Core.pv = NULL;
809 break;
810
811 default:
812 AssertMsgFailed(("enmType=%d\n", pMemLnx->Core.enmType));
813 return VERR_INTERNAL_ERROR;
814 }
815 IPRT_LINUX_RESTORE_EFL_ONLY_AC();
816 return VINF_SUCCESS;
817}
818
819
820DECLHIDDEN(int) rtR0MemObjNativeAllocPage(PPRTR0MEMOBJINTERNAL ppMem, size_t cb, bool fExecutable, const char *pszTag)
821{
822 IPRT_LINUX_SAVE_EFL_AC();
823 PRTR0MEMOBJLNX pMemLnx;
824 int rc;
825
826#if RTLNX_VER_MIN(2,4,22)
827 rc = rtR0MemObjLinuxAllocPages(&pMemLnx, RTR0MEMOBJTYPE_PAGE, cb, PAGE_SIZE, GFP_HIGHUSER,
828 false /* non-contiguous */, fExecutable, VERR_NO_MEMORY, pszTag);
829#else
830 rc = rtR0MemObjLinuxAllocPages(&pMemLnx, RTR0MEMOBJTYPE_PAGE, cb, PAGE_SIZE, GFP_USER,
831 false /* non-contiguous */, fExecutable, VERR_NO_MEMORY, pszTag);
832#endif
833 if (RT_SUCCESS(rc))
834 {
835 rc = rtR0MemObjLinuxVMap(pMemLnx, fExecutable);
836 if (RT_SUCCESS(rc))
837 {
838 *ppMem = &pMemLnx->Core;
839 IPRT_LINUX_RESTORE_EFL_AC();
840 return rc;
841 }
842
843 rtR0MemObjLinuxFreePages(pMemLnx);
844 rtR0MemObjDelete(&pMemLnx->Core);
845 }
846
847 IPRT_LINUX_RESTORE_EFL_AC();
848 return rc;
849}
850
851
852DECLHIDDEN(int) rtR0MemObjNativeAllocLarge(PPRTR0MEMOBJINTERNAL ppMem, size_t cb, size_t cbLargePage, uint32_t fFlags,
853 const char *pszTag)
854{
855#ifdef GFP_TRANSHUGE
856 /*
857 * Allocate a memory object structure that's large enough to contain
858 * the page pointer array.
859 */
860# ifdef __GFP_MOVABLE
861 unsigned const fGfp = (GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE;
862# else
863 unsigned const fGfp = (GFP_TRANSHUGE | __GFP_ZERO);
864# endif
865 size_t const cPagesPerLarge = cbLargePage >> PAGE_SHIFT;
866 unsigned const cLargePageOrder = rtR0MemObjLinuxOrder(cPagesPerLarge);
867 size_t const cLargePages = cb >> (cLargePageOrder + PAGE_SHIFT);
868 size_t const cPages = cb >> PAGE_SHIFT;
869 PRTR0MEMOBJLNX pMemLnx;
870
871 Assert(RT_BIT_64(cLargePageOrder + PAGE_SHIFT) == cbLargePage);
872 pMemLnx = (PRTR0MEMOBJLNX)rtR0MemObjNew(RT_UOFFSETOF_DYN(RTR0MEMOBJLNX, apPages[cPages]),
873 RTR0MEMOBJTYPE_LARGE_PAGE, NULL, cb, pszTag);
874 if (pMemLnx)
875 {
876 size_t iLargePage;
877
878 pMemLnx->Core.fFlags |= RTR0MEMOBJ_FLAGS_ZERO_AT_ALLOC;
879 pMemLnx->cLargePageOrder = cLargePageOrder;
880 pMemLnx->cPages = cPages;
881
882 /*
883 * Allocate the requested number of large pages.
884 */
885 for (iLargePage = 0; iLargePage < cLargePages; iLargePage++)
886 {
887 struct page *paPages = alloc_pages(fGfp, cLargePageOrder);
888 if (paPages)
889 {
890 size_t const iPageBase = iLargePage << cLargePageOrder;
891 size_t iPage = cPagesPerLarge;
892 while (iPage-- > 0)
893 pMemLnx->apPages[iPageBase + iPage] = &paPages[iPage];
894 }
895 else
896 {
897 /*Log(("rtR0MemObjNativeAllocLarge: cb=%#zx cPages=%#zx cLargePages=%#zx cLargePageOrder=%u cPagesPerLarge=%#zx iLargePage=%#zx -> failed!\n",
898 cb, cPages, cLargePages, cLargePageOrder, cPagesPerLarge, iLargePage, paPages));*/
899 while (iLargePage-- > 0)
900 __free_pages(pMemLnx->apPages[iLargePage << (cLargePageOrder - PAGE_SHIFT)], cLargePageOrder);
901 rtR0MemObjDelete(&pMemLnx->Core);
902 return VERR_NO_MEMORY;
903 }
904 }
905 *ppMem = &pMemLnx->Core;
906 return VINF_SUCCESS;
907 }
908 return VERR_NO_MEMORY;
909
910#else
911 /*
912 * We don't call rtR0MemObjFallbackAllocLarge here as it can be a really
913 * bad idea to trigger the swap daemon and whatnot. So, just fail.
914 */
915 RT_NOREF(ppMem, cb, cbLargePage, fFlags, pszTag);
916 return VERR_NOT_SUPPORTED;
917#endif
918}
919
920
921DECLHIDDEN(int) rtR0MemObjNativeAllocLow(PPRTR0MEMOBJINTERNAL ppMem, size_t cb, bool fExecutable, const char *pszTag)
922{
923 IPRT_LINUX_SAVE_EFL_AC();
924 PRTR0MEMOBJLNX pMemLnx;
925 int rc;
926
927 /* Try to avoid GFP_DMA. GFP_DMA32 was introduced with Linux 2.6.15. */
928#if (defined(RT_ARCH_AMD64) || defined(CONFIG_X86_PAE)) && defined(GFP_DMA32)
929 /* ZONE_DMA32: 0-4GB */
930 rc = rtR0MemObjLinuxAllocPages(&pMemLnx, RTR0MEMOBJTYPE_LOW, cb, PAGE_SIZE, GFP_DMA32,
931 false /* non-contiguous */, fExecutable, VERR_NO_LOW_MEMORY, pszTag);
932 if (RT_FAILURE(rc))
933#endif
934#ifdef RT_ARCH_AMD64
935 /* ZONE_DMA: 0-16MB */
936 rc = rtR0MemObjLinuxAllocPages(&pMemLnx, RTR0MEMOBJTYPE_LOW, cb, PAGE_SIZE, GFP_DMA,
937 false /* non-contiguous */, fExecutable, VERR_NO_LOW_MEMORY, pszTag);
938#else
939# ifdef CONFIG_X86_PAE
940# endif
941 /* ZONE_NORMAL: 0-896MB */
942 rc = rtR0MemObjLinuxAllocPages(&pMemLnx, RTR0MEMOBJTYPE_LOW, cb, PAGE_SIZE, GFP_USER,
943 false /* non-contiguous */, fExecutable, VERR_NO_LOW_MEMORY, pszTag);
944#endif
945 if (RT_SUCCESS(rc))
946 {
947 rc = rtR0MemObjLinuxVMap(pMemLnx, fExecutable);
948 if (RT_SUCCESS(rc))
949 {
950 *ppMem = &pMemLnx->Core;
951 IPRT_LINUX_RESTORE_EFL_AC();
952 return rc;
953 }
954
955 rtR0MemObjLinuxFreePages(pMemLnx);
956 rtR0MemObjDelete(&pMemLnx->Core);
957 }
958
959 IPRT_LINUX_RESTORE_EFL_AC();
960 return rc;
961}
962
963
964DECLHIDDEN(int) rtR0MemObjNativeAllocCont(PPRTR0MEMOBJINTERNAL ppMem, size_t cb, bool fExecutable, const char *pszTag)
965{
966 IPRT_LINUX_SAVE_EFL_AC();
967 PRTR0MEMOBJLNX pMemLnx;
968 int rc;
969
970#if (defined(RT_ARCH_AMD64) || defined(CONFIG_X86_PAE)) && defined(GFP_DMA32)
971 /* ZONE_DMA32: 0-4GB */
972 rc = rtR0MemObjLinuxAllocPages(&pMemLnx, RTR0MEMOBJTYPE_CONT, cb, PAGE_SIZE, GFP_DMA32,
973 true /* contiguous */, fExecutable, VERR_NO_CONT_MEMORY, pszTag);
974 if (RT_FAILURE(rc))
975#endif
976#ifdef RT_ARCH_AMD64
977 /* ZONE_DMA: 0-16MB */
978 rc = rtR0MemObjLinuxAllocPages(&pMemLnx, RTR0MEMOBJTYPE_CONT, cb, PAGE_SIZE, GFP_DMA,
979 true /* contiguous */, fExecutable, VERR_NO_CONT_MEMORY, pszTag);
980#else
981 /* ZONE_NORMAL (32-bit hosts): 0-896MB */
982 rc = rtR0MemObjLinuxAllocPages(&pMemLnx, RTR0MEMOBJTYPE_CONT, cb, PAGE_SIZE, GFP_USER,
983 true /* contiguous */, fExecutable, VERR_NO_CONT_MEMORY, pszTag);
984#endif
985 if (RT_SUCCESS(rc))
986 {
987 rc = rtR0MemObjLinuxVMap(pMemLnx, fExecutable);
988 if (RT_SUCCESS(rc))
989 {
990#if defined(RT_STRICT) && (defined(RT_ARCH_AMD64) || defined(CONFIG_HIGHMEM64G))
991 size_t iPage = pMemLnx->cPages;
992 while (iPage-- > 0)
993 Assert(page_to_phys(pMemLnx->apPages[iPage]) < _4G);
994#endif
995 pMemLnx->Core.u.Cont.Phys = page_to_phys(pMemLnx->apPages[0]);
996 *ppMem = &pMemLnx->Core;
997 IPRT_LINUX_RESTORE_EFL_AC();
998 return rc;
999 }
1000
1001 rtR0MemObjLinuxFreePages(pMemLnx);
1002 rtR0MemObjDelete(&pMemLnx->Core);
1003 }
1004
1005 IPRT_LINUX_RESTORE_EFL_AC();
1006 return rc;
1007}
1008
1009
1010/**
1011 * Worker for rtR0MemObjLinuxAllocPhysSub that tries one allocation strategy.
1012 *
1013 * @returns IPRT status code.
1014 * @param ppMem Where to store the memory object pointer on success.
1015 * @param enmType The object type.
1016 * @param cb The size of the allocation.
1017 * @param uAlignment The alignment of the physical memory.
1018 * Only valid for fContiguous == true, ignored otherwise.
1019 * @param PhysHighest See rtR0MemObjNativeAllocPhys.
1020 * @param pszTag Allocation tag used for statistics and such.
1021 * @param fGfp The Linux GFP flags to use for the allocation.
1022 */
1023static int rtR0MemObjLinuxAllocPhysSub2(PPRTR0MEMOBJINTERNAL ppMem, RTR0MEMOBJTYPE enmType,
1024 size_t cb, size_t uAlignment, RTHCPHYS PhysHighest, const char *pszTag, gfp_t fGfp)
1025{
1026 PRTR0MEMOBJLNX pMemLnx;
1027 int rc = rtR0MemObjLinuxAllocPages(&pMemLnx, enmType, cb, uAlignment, fGfp,
1028 enmType == RTR0MEMOBJTYPE_PHYS /* contiguous / non-contiguous */,
1029 false /*fExecutable*/, VERR_NO_PHYS_MEMORY, pszTag);
1030 if (RT_FAILURE(rc))
1031 return rc;
1032
1033 /*
1034 * Check the addresses if necessary. (Can be optimized a bit for PHYS.)
1035 */
1036 if (PhysHighest != NIL_RTHCPHYS)
1037 {
1038 size_t iPage = pMemLnx->cPages;
1039 while (iPage-- > 0)
1040 if (page_to_phys(pMemLnx->apPages[iPage]) > PhysHighest)
1041 {
1042 rtR0MemObjLinuxFreePages(pMemLnx);
1043 rtR0MemObjDelete(&pMemLnx->Core);
1044 return VERR_NO_MEMORY;
1045 }
1046 }
1047
1048 /*
1049 * Complete the object.
1050 */
1051 if (enmType == RTR0MEMOBJTYPE_PHYS)
1052 {
1053 pMemLnx->Core.u.Phys.PhysBase = page_to_phys(pMemLnx->apPages[0]);
1054 pMemLnx->Core.u.Phys.fAllocated = true;
1055 }
1056 *ppMem = &pMemLnx->Core;
1057 return rc;
1058}
1059
1060
1061/**
1062 * Worker for rtR0MemObjNativeAllocPhys and rtR0MemObjNativeAllocPhysNC.
1063 *
1064 * @returns IPRT status code.
1065 * @param ppMem Where to store the memory object pointer on success.
1066 * @param enmType The object type.
1067 * @param cb The size of the allocation.
1068 * @param uAlignment The alignment of the physical memory.
1069 * Only valid for enmType == RTR0MEMOBJTYPE_PHYS, ignored otherwise.
1070 * @param PhysHighest See rtR0MemObjNativeAllocPhys.
1071 * @param pszTag Allocation tag used for statistics and such.
1072 */
1073static int rtR0MemObjLinuxAllocPhysSub(PPRTR0MEMOBJINTERNAL ppMem, RTR0MEMOBJTYPE enmType,
1074 size_t cb, size_t uAlignment, RTHCPHYS PhysHighest, const char *pszTag)
1075{
1076 int rc;
1077 IPRT_LINUX_SAVE_EFL_AC();
1078
1079 /*
1080 * There are two clear cases, and those are the <=16MB and the anything-goes ones.
1081 * When the physical address limit is somewhere in-between those two we'll
1082 * just have to try, starting with HIGHUSER and working our way through the
1083 * different types, hoping we'll get lucky.
1084 *
1085 * We should probably move this physical address restriction logic up to
1086 * the page alloc function as it would be more efficient there. But since
1087 * we don't expect this to be a performance issue just yet it can wait.
1088 */
1089 if (PhysHighest == NIL_RTHCPHYS)
1090 /* ZONE_HIGHMEM: the whole physical memory */
1091 rc = rtR0MemObjLinuxAllocPhysSub2(ppMem, enmType, cb, uAlignment, PhysHighest, pszTag, GFP_HIGHUSER);
1092 else if (PhysHighest <= _1M * 16)
1093 /* ZONE_DMA: 0-16MB */
1094 rc = rtR0MemObjLinuxAllocPhysSub2(ppMem, enmType, cb, uAlignment, PhysHighest, pszTag, GFP_DMA);
1095 else
1096 {
1097 rc = VERR_NO_MEMORY;
1098 if (RT_FAILURE(rc))
1099 /* ZONE_HIGHMEM: the whole physical memory */
1100 rc = rtR0MemObjLinuxAllocPhysSub2(ppMem, enmType, cb, uAlignment, PhysHighest, pszTag, GFP_HIGHUSER);
1101 if (RT_FAILURE(rc))
1102 /* ZONE_NORMAL: 0-896MB */
1103 rc = rtR0MemObjLinuxAllocPhysSub2(ppMem, enmType, cb, uAlignment, PhysHighest, pszTag, GFP_USER);
1104#ifdef GFP_DMA32
1105 if (RT_FAILURE(rc))
1106 /* ZONE_DMA32: 0-4GB */
1107 rc = rtR0MemObjLinuxAllocPhysSub2(ppMem, enmType, cb, uAlignment, PhysHighest, pszTag, GFP_DMA32);
1108#endif
1109 if (RT_FAILURE(rc))
1110 /* ZONE_DMA: 0-16MB */
1111 rc = rtR0MemObjLinuxAllocPhysSub2(ppMem, enmType, cb, uAlignment, PhysHighest, pszTag, GFP_DMA);
1112 }
1113 IPRT_LINUX_RESTORE_EFL_AC();
1114 return rc;
1115}
1116
1117
1118/**
1119 * Translates a kernel virtual address to a linux page structure by walking the
1120 * page tables.
1121 *
1122 * @note We do assume that the page tables will not change as we are walking
1123 * them. This assumption is rather forced by the fact that I could not
1124 * immediately see any way of preventing this from happening. So, we
1125 * take some extra care when accessing them.
1126 *
1127 * Because of this, we don't want to use this function on memory where
1128 * attribute changes to nearby pages is likely to cause large pages to
1129 * be used or split up. So, don't use this for the linear mapping of
1130 * physical memory.
1131 *
1132 * @returns Pointer to the page structure or NULL if it could not be found.
1133 * @param pv The kernel virtual address.
1134 */
1135RTDECL(struct page *) rtR0MemObjLinuxVirtToPage(void *pv)
1136{
1137 unsigned long ulAddr = (unsigned long)pv;
1138 unsigned long pfn;
1139 struct page *pPage;
1140 pte_t *pEntry;
1141 union
1142 {
1143 pgd_t Global;
1144#if RTLNX_VER_MIN(4,12,0)
1145 p4d_t Four;
1146#endif
1147#if RTLNX_VER_MIN(2,6,11)
1148 pud_t Upper;
1149#endif
1150 pmd_t Middle;
1151 pte_t Entry;
1152 } u;
1153
1154 /* Should this happen in a situation this code will be called in? And if
1155 * so, can it change under our feet? See also
1156 * "Documentation/vm/active_mm.txt" in the kernel sources. */
1157 if (RT_UNLIKELY(!current->active_mm))
1158 return NULL;
1159 u.Global = *pgd_offset(current->active_mm, ulAddr);
1160 if (RT_UNLIKELY(pgd_none(u.Global)))
1161 return NULL;
1162#if RTLNX_VER_MIN(2,6,11)
1163# if RTLNX_VER_MIN(4,12,0)
1164 u.Four = *p4d_offset(&u.Global, ulAddr);
1165 if (RT_UNLIKELY(p4d_none(u.Four)))
1166 return NULL;
1167 if (p4d_large(u.Four))
1168 {
1169 pPage = p4d_page(u.Four);
1170 AssertReturn(pPage, NULL);
1171 pfn = page_to_pfn(pPage); /* doing the safe way... */
1172 AssertCompile(P4D_SHIFT - PAGE_SHIFT < 31);
1173 pfn += (ulAddr >> PAGE_SHIFT) & ((UINT32_C(1) << (P4D_SHIFT - PAGE_SHIFT)) - 1);
1174 return pfn_to_page(pfn);
1175 }
1176 u.Upper = *pud_offset(&u.Four, ulAddr);
1177# else /* < 4.12 */
1178 u.Upper = *pud_offset(&u.Global, ulAddr);
1179# endif /* < 4.12 */
1180 if (RT_UNLIKELY(pud_none(u.Upper)))
1181 return NULL;
1182# if RTLNX_VER_MIN(2,6,25)
1183 if (pud_large(u.Upper))
1184 {
1185 pPage = pud_page(u.Upper);
1186 AssertReturn(pPage, NULL);
1187 pfn = page_to_pfn(pPage); /* doing the safe way... */
1188 pfn += (ulAddr >> PAGE_SHIFT) & ((UINT32_C(1) << (PUD_SHIFT - PAGE_SHIFT)) - 1);
1189 return pfn_to_page(pfn);
1190 }
1191# endif
1192 u.Middle = *pmd_offset(&u.Upper, ulAddr);
1193#else /* < 2.6.11 */
1194 u.Middle = *pmd_offset(&u.Global, ulAddr);
1195#endif /* < 2.6.11 */
1196 if (RT_UNLIKELY(pmd_none(u.Middle)))
1197 return NULL;
1198#if RTLNX_VER_MIN(2,6,0)
1199 if (pmd_large(u.Middle))
1200 {
1201 pPage = pmd_page(u.Middle);
1202 AssertReturn(pPage, NULL);
1203 pfn = page_to_pfn(pPage); /* doing the safe way... */
1204 pfn += (ulAddr >> PAGE_SHIFT) & ((UINT32_C(1) << (PMD_SHIFT - PAGE_SHIFT)) - 1);
1205 return pfn_to_page(pfn);
1206 }
1207#endif
1208
1209#if RTLNX_VER_MIN(2,5,5) || defined(pte_offset_map) /* As usual, RHEL 3 had pte_offset_map earlier. */
1210 pEntry = pte_offset_map(&u.Middle, ulAddr);
1211#else
1212 pEntry = pte_offset(&u.Middle, ulAddr);
1213#endif
1214 if (RT_UNLIKELY(!pEntry))
1215 return NULL;
1216 u.Entry = *pEntry;
1217#if RTLNX_VER_MIN(2,5,5) || defined(pte_offset_map)
1218 pte_unmap(pEntry);
1219#endif
1220
1221 if (RT_UNLIKELY(!pte_present(u.Entry)))
1222 return NULL;
1223 return pte_page(u.Entry);
1224}
1225RT_EXPORT_SYMBOL(rtR0MemObjLinuxVirtToPage);
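/* Minimal usage sketch (illustrative): resolving a vmap'ed kernel address to
 * its physical address via the page-table walk above; pv is assumed to be a
 * valid, mapped kernel pointer outside the linear mapping:
 *
 *     struct page *pPage = rtR0MemObjLinuxVirtToPage(pv);
 *     if (pPage)
 *         Phys = page_to_phys(pPage) | ((uintptr_t)pv & ~PAGE_MASK);
 */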
1226
1227
1228DECLHIDDEN(int) rtR0MemObjNativeAllocPhys(PPRTR0MEMOBJINTERNAL ppMem, size_t cb, RTHCPHYS PhysHighest, size_t uAlignment,
1229 const char *pszTag)
1230{
1231 return rtR0MemObjLinuxAllocPhysSub(ppMem, RTR0MEMOBJTYPE_PHYS, cb, uAlignment, PhysHighest, pszTag);
1232}
1233
1234
1235DECLHIDDEN(int) rtR0MemObjNativeAllocPhysNC(PPRTR0MEMOBJINTERNAL ppMem, size_t cb, RTHCPHYS PhysHighest, const char *pszTag)
1236{
1237 return rtR0MemObjLinuxAllocPhysSub(ppMem, RTR0MEMOBJTYPE_PHYS_NC, cb, PAGE_SIZE, PhysHighest, pszTag);
1238}
1239
1240
1241DECLHIDDEN(int) rtR0MemObjNativeEnterPhys(PPRTR0MEMOBJINTERNAL ppMem, RTHCPHYS Phys, size_t cb, uint32_t uCachePolicy,
1242 const char *pszTag)
1243{
1244 IPRT_LINUX_SAVE_EFL_AC();
1245
1246 /*
1247 * All we need to do here is to validate that we can use
1248 * ioremap on the specified address (32/64-bit dma_addr_t).
1249 */
1250 PRTR0MEMOBJLNX pMemLnx;
1251 dma_addr_t PhysAddr = Phys;
1252 AssertMsgReturn(PhysAddr == Phys, ("%#llx\n", (unsigned long long)Phys), VERR_ADDRESS_TOO_BIG);
1253
1254 pMemLnx = (PRTR0MEMOBJLNX)rtR0MemObjNew(sizeof(*pMemLnx), RTR0MEMOBJTYPE_PHYS, NULL, cb, pszTag);
1255 if (!pMemLnx)
1256 {
1257 IPRT_LINUX_RESTORE_EFL_AC();
1258 return VERR_NO_MEMORY;
1259 }
1260
1261 pMemLnx->Core.u.Phys.PhysBase = PhysAddr;
1262 pMemLnx->Core.u.Phys.fAllocated = false;
1263 pMemLnx->Core.u.Phys.uCachePolicy = uCachePolicy;
1264 Assert(!pMemLnx->cPages);
1265 *ppMem = &pMemLnx->Core;
1266 IPRT_LINUX_RESTORE_EFL_AC();
1267 return VINF_SUCCESS;
1268}
1269
1270/* openSUSE Leap 42.3 detection :-/ */
1271#if RTLNX_VER_RANGE(4,4,0, 4,6,0) && defined(FAULT_FLAG_REMOTE)
1272# define GET_USER_PAGES_API KERNEL_VERSION(4, 10, 0) /* no typo! */
1273#else
1274# define GET_USER_PAGES_API LINUX_VERSION_CODE
1275#endif
1276
1277DECLHIDDEN(int) rtR0MemObjNativeLockUser(PPRTR0MEMOBJINTERNAL ppMem, RTR3PTR R3Ptr, size_t cb, uint32_t fAccess,
1278 RTR0PROCESS R0Process, const char *pszTag)
1279{
1280 IPRT_LINUX_SAVE_EFL_AC();
1281 const int cPages = cb >> PAGE_SHIFT;
1282 struct task_struct *pTask = rtR0ProcessToLinuxTask(R0Process);
1283 struct vm_area_struct **papVMAs;
1284 PRTR0MEMOBJLNX pMemLnx;
1285 int rc = VERR_NO_MEMORY;
1286 int const fWrite = fAccess & RTMEM_PROT_WRITE ? 1 : 0;
1287
1288 /*
1289 * Check for valid task and size overflows.
1290 */
1291 if (!pTask)
1292 return VERR_NOT_SUPPORTED;
1293 if (((size_t)cPages << PAGE_SHIFT) != cb)
1294 return VERR_OUT_OF_RANGE;
1295
1296 /*
1297 * Allocate the memory object and a temporary buffer for the VMAs.
1298 */
1299 pMemLnx = (PRTR0MEMOBJLNX)rtR0MemObjNew(RT_UOFFSETOF_DYN(RTR0MEMOBJLNX, apPages[cPages]), RTR0MEMOBJTYPE_LOCK,
1300 (void *)R3Ptr, cb, pszTag);
1301 if (!pMemLnx)
1302 {
1303 IPRT_LINUX_RESTORE_EFL_AC();
1304 return VERR_NO_MEMORY;
1305 }
1306
1307 papVMAs = (struct vm_area_struct **)RTMemAlloc(sizeof(*papVMAs) * cPages);
1308 if (papVMAs)
1309 {
1310 LNX_MM_DOWN_READ(pTask->mm);
1311
1312 /*
1313 * Get user pages.
1314 */
1315/** @todo r=bird: Should we not force read access too? */
1316#if GET_USER_PAGES_API >= KERNEL_VERSION(4, 6, 0)
1317 if (R0Process == RTR0ProcHandleSelf())
1318 rc = get_user_pages(R3Ptr, /* Where from. */
1319 cPages, /* How many pages. */
1320# if GET_USER_PAGES_API >= KERNEL_VERSION(4, 9, 0)
1321 fWrite ? FOLL_WRITE | /* Write to memory. */
1322 FOLL_FORCE /* force write access. */
1323 : 0, /* Write to memory. */
1324# else
1325 fWrite, /* Write to memory. */
1326 fWrite, /* force write access. */
1327# endif
1328 &pMemLnx->apPages[0], /* Page array. */
1329 papVMAs); /* vmas */
1330 /*
1331 * Actually this should not happen at the moment as we call this function
1332 * only for our own process.
1333 */
1334 else
1335 rc = get_user_pages_remote(
1336# if GET_USER_PAGES_API < KERNEL_VERSION(5, 9, 0)
1337 pTask, /* Task for fault accounting. */
1338# endif
1339 pTask->mm, /* Whose pages. */
1340 R3Ptr, /* Where from. */
1341 cPages, /* How many pages. */
1342# if GET_USER_PAGES_API >= KERNEL_VERSION(4, 9, 0)
1343 fWrite ? FOLL_WRITE | /* Write to memory. */
1344 FOLL_FORCE /* force write access. */
1345 : 0, /* Write to memory. */
1346# else
1347 fWrite, /* Write to memory. */
1348 fWrite, /* force write access. */
1349# endif
1350 &pMemLnx->apPages[0], /* Page array. */
1351 papVMAs /* vmas */
1352# if GET_USER_PAGES_API >= KERNEL_VERSION(4, 10, 0)
1353 , NULL /* locked */
1354# endif
1355 );
1356#else /* GET_USER_PAGES_API < KERNEL_VERSION(4, 6, 0) */
1357 rc = get_user_pages(pTask, /* Task for fault accounting. */
1358 pTask->mm, /* Whose pages. */
1359 R3Ptr, /* Where from. */
1360 cPages, /* How many pages. */
1361/* The get_user_pages API change was back-ported to 4.4.168. */
1362# if RTLNX_VER_RANGE(4,4,168, 4,5,0)
1363 fWrite ? FOLL_WRITE | /* Write to memory. */
1364 FOLL_FORCE /* force write access. */
1365 : 0, /* Write to memory. */
1366# else
1367 fWrite, /* Write to memory. */
1368 fWrite, /* force write access. */
1369# endif
1370 &pMemLnx->apPages[0], /* Page array. */
1371 papVMAs); /* vmas */
1372#endif /* GET_USER_PAGES_API < KERNEL_VERSION(4, 6, 0) */
1373 if (rc == cPages)
1374 {
1375 /*
1376 * Flush dcache (required?), protect against fork and _really_ pin the page
1377 * table entries. get_user_pages() will protect against swapping out the
1378 * pages but it will NOT protect against removing page table entries. This
1379 * can be achieved with
1380 * - using mlock / mmap(..., MAP_LOCKED, ...) from userland. This requires
1381 * an appropriate limit set up with setrlimit(..., RLIMIT_MEMLOCK, ...).
1382 * Usual Linux distributions support only a limited size of locked pages
1383 * (e.g. 32KB).
1384 * - setting the PageReserved bit (as we do in rtR0MemObjLinuxAllocPages()),
1385 * or by
1386 * - setting the VM_LOCKED flag. This is the same as doing mlock() without
1387 * a range check.
1388 */
1389 /** @todo The Linux fork() protection will require more work if this API
1390 * is to be used for anything but locking VM pages. */
1391 while (rc-- > 0)
1392 {
1393 flush_dcache_page(pMemLnx->apPages[rc]);
1394 papVMAs[rc]->vm_flags |= VM_DONTCOPY | VM_LOCKED;
1395 }
1396
1397 LNX_MM_UP_READ(pTask->mm);
1398
1399 RTMemFree(papVMAs);
1400
1401 pMemLnx->Core.u.Lock.R0Process = R0Process;
1402 pMemLnx->cPages = cPages;
1403 Assert(!pMemLnx->fMappedToRing0);
1404 *ppMem = &pMemLnx->Core;
1405
1406 IPRT_LINUX_RESTORE_EFL_AC();
1407 return VINF_SUCCESS;
1408 }
1409
1410 /*
1411 * Failed - we need to unlock any pages that we succeeded in locking.
1412 */
1413 while (rc-- > 0)
1414 {
1415 if (!PageReserved(pMemLnx->apPages[rc]))
1416 SetPageDirty(pMemLnx->apPages[rc]);
1417#if RTLNX_VER_MIN(4,6,0)
1418 put_page(pMemLnx->apPages[rc]);
1419#else
1420 page_cache_release(pMemLnx->apPages[rc]);
1421#endif
1422 }
1423
1424 LNX_MM_UP_READ(pTask->mm);
1425
1426 RTMemFree(papVMAs);
1427 rc = VERR_LOCK_FAILED;
1428 }
1429
1430 rtR0MemObjDelete(&pMemLnx->Core);
1431 IPRT_LINUX_RESTORE_EFL_AC();
1432 return rc;
1433}
1434
1435
1436DECLHIDDEN(int) rtR0MemObjNativeLockKernel(PPRTR0MEMOBJINTERNAL ppMem, void *pv, size_t cb, uint32_t fAccess, const char *pszTag)
1437{
1438 IPRT_LINUX_SAVE_EFL_AC();
1439 void *pvLast = (uint8_t *)pv + cb - 1;
1440 size_t const cPages = cb >> PAGE_SHIFT;
1441 PRTR0MEMOBJLNX pMemLnx;
1442 bool fLinearMapping;
1443 int rc;
1444 uint8_t *pbPage;
1445 size_t iPage;
1446 NOREF(fAccess);
1447
1448 if ( !RTR0MemKernelIsValidAddr(pv)
1449 || !RTR0MemKernelIsValidAddr(pv + cb))
1450 return VERR_INVALID_PARAMETER;
1451
1452 /*
1453 * The lower part of the kernel memory has a linear mapping between
1454 * physical and virtual addresses. So we take a short cut here. This is
1455 * assumed to be the cleanest way to handle those addresses (and the code
1456 * is well tested, though the test for determining it is not very nice).
1457 * If we ever decide it isn't we can still remove it.
1458 */
1459#if 0
1460 fLinearMapping = (unsigned long)pvLast < VMALLOC_START;
1461#else
1462 fLinearMapping = (unsigned long)pv >= (unsigned long)__va(0)
1463 && (unsigned long)pvLast < (unsigned long)high_memory;
1464#endif
1465
1466 /*
1467 * Allocate the memory object.
1468 */
1469 pMemLnx = (PRTR0MEMOBJLNX)rtR0MemObjNew(RT_UOFFSETOF_DYN(RTR0MEMOBJLNX, apPages[cPages]), RTR0MEMOBJTYPE_LOCK,
1470 pv, cb, pszTag);
1471 if (!pMemLnx)
1472 {
1473 IPRT_LINUX_RESTORE_EFL_AC();
1474 return VERR_NO_MEMORY;
1475 }
1476
1477 /*
1478 * Gather the pages.
1479 * We ASSUME all kernel pages are non-swappable and non-movable.
1480 */
1481 rc = VINF_SUCCESS;
1482 pbPage = (uint8_t *)pvLast;
1483 iPage = cPages;
1484 if (!fLinearMapping)
1485 {
1486 while (iPage-- > 0)
1487 {
1488 struct page *pPage = rtR0MemObjLinuxVirtToPage(pbPage);
1489 if (RT_UNLIKELY(!pPage))
1490 {
1491 rc = VERR_LOCK_FAILED;
1492 break;
1493 }
1494 pMemLnx->apPages[iPage] = pPage;
1495 pbPage -= PAGE_SIZE;
1496 }
1497 }
1498 else
1499 {
1500 while (iPage-- > 0)
1501 {
1502 pMemLnx->apPages[iPage] = virt_to_page(pbPage);
1503 pbPage -= PAGE_SIZE;
1504 }
1505 }
1506 if (RT_SUCCESS(rc))
1507 {
1508 /*
1509 * Complete the memory object and return.
1510 */
1511 pMemLnx->Core.u.Lock.R0Process = NIL_RTR0PROCESS;
1512 pMemLnx->cPages = cPages;
1513 Assert(!pMemLnx->fMappedToRing0);
1514 *ppMem = &pMemLnx->Core;
1515
1516 IPRT_LINUX_RESTORE_EFL_AC();
1517 return VINF_SUCCESS;
1518 }
1519
1520 rtR0MemObjDelete(&pMemLnx->Core);
1521 IPRT_LINUX_RESTORE_EFL_AC();
1522 return rc;
1523}
1524
1525
1526DECLHIDDEN(int) rtR0MemObjNativeReserveKernel(PPRTR0MEMOBJINTERNAL ppMem, void *pvFixed, size_t cb, size_t uAlignment,
1527 const char *pszTag)
1528{
1529#if RTLNX_VER_MIN(2,4,22)
1530 IPRT_LINUX_SAVE_EFL_AC();
1531 const size_t cPages = cb >> PAGE_SHIFT;
1532 struct page *pDummyPage;
1533 struct page **papPages;
1534
1535 /* check for unsupported stuff. */
1536 AssertMsgReturn(pvFixed == (void *)-1, ("%p\n", pvFixed), VERR_NOT_SUPPORTED);
1537 if (uAlignment > PAGE_SIZE)
1538 return VERR_NOT_SUPPORTED;
1539
1540 /*
1541 * Allocate a dummy page and create a page pointer array for vmap such that
1542 * the dummy page is mapped all over the reserved area.
1543 */
1544 pDummyPage = alloc_page(GFP_HIGHUSER | __GFP_NOWARN);
1545 if (pDummyPage)
1546 {
1547 papPages = RTMemAlloc(sizeof(*papPages) * cPages);
1548 if (papPages)
1549 {
1550 void *pv;
1551 size_t iPage = cPages;
1552 while (iPage-- > 0)
1553 papPages[iPage] = pDummyPage;
1554# ifdef VM_MAP
1555 pv = vmap(papPages, cPages, VM_MAP, PAGE_KERNEL_RO);
1556# else
1557 pv = vmap(papPages, cPages, VM_ALLOC, PAGE_KERNEL_RO);
1558# endif
1559 RTMemFree(papPages);
1560 if (pv)
1561 {
1562 PRTR0MEMOBJLNX pMemLnx = (PRTR0MEMOBJLNX)rtR0MemObjNew(sizeof(*pMemLnx), RTR0MEMOBJTYPE_RES_VIRT, pv, cb, pszTag);
1563 if (pMemLnx)
1564 {
1565 pMemLnx->Core.u.ResVirt.R0Process = NIL_RTR0PROCESS;
1566 pMemLnx->cPages = 1;
1567 pMemLnx->apPages[0] = pDummyPage;
1568 *ppMem = &pMemLnx->Core;
1569 IPRT_LINUX_RESTORE_EFL_AC();
1570 return VINF_SUCCESS;
1571 }
1572 vunmap(pv);
1573 }
1574 }
1575 __free_page(pDummyPage);
1576 }
1577 IPRT_LINUX_RESTORE_EFL_AC();
1578 return VERR_NO_MEMORY;
1579
1580#else /* < 2.4.22 */
1581 /*
1582 * Could probably use ioremap here, but the caller is in a better position than us
1583 * to select some safe physical memory.
1584 */
1585 return VERR_NOT_SUPPORTED;
1586#endif
1587}
1588
1589
1590DECLHIDDEN(int) rtR0MemObjNativeReserveUser(PPRTR0MEMOBJINTERNAL ppMem, RTR3PTR R3PtrFixed, size_t cb, size_t uAlignment,
1591 RTR0PROCESS R0Process, const char *pszTag)
1592{
1593 IPRT_LINUX_SAVE_EFL_AC();
1594 PRTR0MEMOBJLNX pMemLnx;
1595 void *pv;
1596 struct task_struct *pTask = rtR0ProcessToLinuxTask(R0Process);
1597 if (!pTask)
1598 return VERR_NOT_SUPPORTED;
1599
1600 /*
1601 * Check that the specified alignment is supported.
1602 */
1603 if (uAlignment > PAGE_SIZE)
1604 return VERR_NOT_SUPPORTED;
1605
1606 /*
1607 * Let rtR0MemObjLinuxDoMmap do the difficult bits.
1608 */
1609 pv = rtR0MemObjLinuxDoMmap(R3PtrFixed, cb, uAlignment, pTask, RTMEM_PROT_NONE);
1610 if (pv == (void *)-1)
1611 {
1612 IPRT_LINUX_RESTORE_EFL_AC();
1613 return VERR_NO_MEMORY;
1614 }
1615
1616 pMemLnx = (PRTR0MEMOBJLNX)rtR0MemObjNew(sizeof(*pMemLnx), RTR0MEMOBJTYPE_RES_VIRT, pv, cb, pszTag);
1617 if (!pMemLnx)
1618 {
1619 rtR0MemObjLinuxDoMunmap(pv, cb, pTask);
1620 IPRT_LINUX_RESTORE_EFL_AC();
1621 return VERR_NO_MEMORY;
1622 }
1623
1624 pMemLnx->Core.u.ResVirt.R0Process = R0Process;
1625 *ppMem = &pMemLnx->Core;
1626 IPRT_LINUX_RESTORE_EFL_AC();
1627 return VINF_SUCCESS;
1628}
1629
1630
1631DECLHIDDEN(int) rtR0MemObjNativeMapKernel(PPRTR0MEMOBJINTERNAL ppMem, RTR0MEMOBJ pMemToMap, void *pvFixed, size_t uAlignment,
1632 unsigned fProt, size_t offSub, size_t cbSub, const char *pszTag)
1633{
1634 int rc = VERR_NO_MEMORY;
1635 PRTR0MEMOBJLNX pMemLnxToMap = (PRTR0MEMOBJLNX)pMemToMap;
1636 PRTR0MEMOBJLNX pMemLnx;
1637 IPRT_LINUX_SAVE_EFL_AC();
1638
1639 /* Fail if requested to do something we can't. */
1640 AssertMsgReturn(pvFixed == (void *)-1, ("%p\n", pvFixed), VERR_NOT_SUPPORTED);
1641 if (uAlignment > PAGE_SIZE)
1642 return VERR_NOT_SUPPORTED;
1643
1644 /*
1645 * Create the IPRT memory object.
1646 */
1647 if (!cbSub)
1648 cbSub = pMemLnxToMap->Core.cb - offSub;
1649 pMemLnx = (PRTR0MEMOBJLNX)rtR0MemObjNew(sizeof(*pMemLnx), RTR0MEMOBJTYPE_MAPPING, NULL, cbSub, pszTag);
1650 if (pMemLnx)
1651 {
1652 if (pMemLnxToMap->cPages)
1653 {
1654#if RTLNX_VER_MIN(2,4,22)
1655 /*
1656 * Use vmap - 2.4.22 and later.
1657 */
1658 pgprot_t fPg = rtR0MemObjLinuxConvertProt(fProt, true /* kernel */);
1659 /** @todo We don't really care too much for EXEC here... 5.8 always adds NX. */
1660 Assert(((offSub + cbSub) >> PAGE_SHIFT) <= pMemLnxToMap->cPages);
1661# ifdef VM_MAP
1662 pMemLnx->Core.pv = vmap(&pMemLnxToMap->apPages[offSub >> PAGE_SHIFT], cbSub >> PAGE_SHIFT, VM_MAP, fPg);
1663# else
1664 pMemLnx->Core.pv = vmap(&pMemLnxToMap->apPages[offSub >> PAGE_SHIFT], cbSub >> PAGE_SHIFT, VM_ALLOC, fPg);
1665# endif
1666 if (pMemLnx->Core.pv)
1667 {
1668 pMemLnx->fMappedToRing0 = true;
1669 rc = VINF_SUCCESS;
1670 }
1671 else
1672 rc = VERR_MAP_FAILED;
1673
1674#else /* < 2.4.22 */
1675 /*
1676 * Only option here is to share mappings if possible and forget about fProt.
1677 */
1678 if (rtR0MemObjIsRing3(pMemToMap))
1679 rc = VERR_NOT_SUPPORTED;
1680 else
1681 {
1682 rc = VINF_SUCCESS;
1683 if (!pMemLnxToMap->Core.pv)
1684 rc = rtR0MemObjLinuxVMap(pMemLnxToMap, !!(fProt & RTMEM_PROT_EXEC));
1685 if (RT_SUCCESS(rc))
1686 {
1687 Assert(pMemLnxToMap->Core.pv);
1688 pMemLnx->Core.pv = (uint8_t *)pMemLnxToMap->Core.pv + offSub;
1689 }
1690 }
1691#endif
1692 }
1693 else
1694 {
1695 /*
1696 * MMIO / physical memory.
1697 */
1698 Assert(pMemLnxToMap->Core.enmType == RTR0MEMOBJTYPE_PHYS && !pMemLnxToMap->Core.u.Phys.fAllocated);
1699#if RTLNX_VER_MIN(2,6,25)
1700 /*
1701 * ioremap() defaults to no caching since the 2.6 kernels.
1702 * ioremap_nocache() was finally removed in 5.6-rc1.
1703 */
1704 pMemLnx->Core.pv = pMemLnxToMap->Core.u.Phys.uCachePolicy == RTMEM_CACHE_POLICY_MMIO
1705 ? ioremap(pMemLnxToMap->Core.u.Phys.PhysBase + offSub, cbSub)
1706 : ioremap_cache(pMemLnxToMap->Core.u.Phys.PhysBase + offSub, cbSub);
1707#else /* KERNEL_VERSION < 2.6.25 */
1708 pMemLnx->Core.pv = pMemLnxToMap->Core.u.Phys.uCachePolicy == RTMEM_CACHE_POLICY_MMIO
1709 ? ioremap_nocache(pMemLnxToMap->Core.u.Phys.PhysBase + offSub, cbSub)
1710 : ioremap(pMemLnxToMap->Core.u.Phys.PhysBase + offSub, cbSub);
1711#endif /* KERNEL_VERSION < 2.6.25 */
1712 if (pMemLnx->Core.pv)
1713 {
1714 /** @todo fix protection. */
1715 rc = VINF_SUCCESS;
1716 }
1717 }
1718 if (RT_SUCCESS(rc))
1719 {
1720 pMemLnx->Core.u.Mapping.R0Process = NIL_RTR0PROCESS;
1721 *ppMem = &pMemLnx->Core;
1722 IPRT_LINUX_RESTORE_EFL_AC();
1723 return VINF_SUCCESS;
1724 }
1725 rtR0MemObjDelete(&pMemLnx->Core);
1726 }
1727
1728 IPRT_LINUX_RESTORE_EFL_AC();
1729 return rc;
1730}
1731
1732
1733#ifdef VBOX_USE_PAE_HACK
1734/**
1735 * Replace the PFN of a PTE with the address of the actual page.
1736 *
1737 * The caller maps a reserved dummy page at the address with the desired access
1738 * and flags.
1739 *
1740 * This hack is required for older Linux kernels which don't provide
1741 * remap_pfn_range().
1742 *
1743 * @returns 0 on success, -ENOMEM on failure.
1744 * @param mm The memory context.
1745 * @param ulAddr The mapping address.
1746 * @param Phys The physical address of the page to map.
1747 */
1748static int rtR0MemObjLinuxFixPte(struct mm_struct *mm, unsigned long ulAddr, RTHCPHYS Phys)
1749{
1750 int rc = -ENOMEM;
1751 pgd_t *pgd;
1752
1753 spin_lock(&mm->page_table_lock);
1754
1755 pgd = pgd_offset(mm, ulAddr);
1756 if (!pgd_none(*pgd) && !pgd_bad(*pgd))
1757 {
1758 pmd_t *pmd = pmd_offset(pgd, ulAddr);
1759 if (!pmd_none(*pmd))
1760 {
1761 pte_t *ptep = pte_offset_map(pmd, ulAddr);
1762 if (ptep)
1763 {
1764 pte_t pte = *ptep;
1765 pte.pte_high &= 0xfff00000;
1766 pte.pte_high |= ((Phys >> 32) & 0x000fffff);
1767 pte.pte_low &= 0x00000fff;
1768 pte.pte_low |= (Phys & 0xfffff000);
1769 set_pte(ptep, pte);
1770 pte_unmap(ptep);
1771 rc = 0;
1772 }
1773 }
1774 }
1775
1776 spin_unlock(&mm->page_table_lock);
1777 return rc;
1778}
1779#endif /* VBOX_USE_PAE_HACK */
1780
1781
1782DECLHIDDEN(int) rtR0MemObjNativeMapUser(PPRTR0MEMOBJINTERNAL ppMem, RTR0MEMOBJ pMemToMap, RTR3PTR R3PtrFixed, size_t uAlignment,
1783 unsigned fProt, RTR0PROCESS R0Process, size_t offSub, size_t cbSub, const char *pszTag)
1784{
1785 struct task_struct *pTask = rtR0ProcessToLinuxTask(R0Process);
1786 PRTR0MEMOBJLNX pMemLnxToMap = (PRTR0MEMOBJLNX)pMemToMap;
1787 int rc = VERR_NO_MEMORY;
1788 PRTR0MEMOBJLNX pMemLnx;
1789#ifdef VBOX_USE_PAE_HACK
1790 struct page *pDummyPage;
1791 RTHCPHYS DummyPhys;
1792#endif
1793 IPRT_LINUX_SAVE_EFL_AC();
1794
1795 /*
1796 * Check for restrictions.
1797 */
1798 if (!pTask)
1799 return VERR_NOT_SUPPORTED;
1800 if (uAlignment > PAGE_SIZE)
1801 return VERR_NOT_SUPPORTED;
1802
1803#ifdef VBOX_USE_PAE_HACK
1804 /*
1805 * Allocate a dummy page for use when mapping the memory.
1806 */
1807 pDummyPage = alloc_page(GFP_USER | __GFP_NOWARN);
1808 if (!pDummyPage)
1809 {
1810 IPRT_LINUX_RESTORE_EFL_AC();
1811 return VERR_NO_MEMORY;
1812 }
1813 SetPageReserved(pDummyPage);
1814 DummyPhys = page_to_phys(pDummyPage);
1815#endif
1816
1817 /*
1818 * Create the IPRT memory object.
1819 */
1820 Assert(!offSub || cbSub);
1821 if (cbSub == 0)
1822 cbSub = pMemLnxToMap->Core.cb;
1823 pMemLnx = (PRTR0MEMOBJLNX)rtR0MemObjNew(sizeof(*pMemLnx), RTR0MEMOBJTYPE_MAPPING, NULL, cbSub, pszTag);
1824 if (pMemLnx)
1825 {
1826 /*
1827 * Allocate user space mapping.
1828 */
1829 void *pv;
1830 pv = rtR0MemObjLinuxDoMmap(R3PtrFixed, cbSub, uAlignment, pTask, fProt);
1831 if (pv != (void *)-1)
1832 {
1833 /*
1834 * Map page by page into the mmap area.
1835 * This is generic, paranoid and not very efficient.
1836 */
1837 pgprot_t fPg = rtR0MemObjLinuxConvertProt(fProt, false /* user */);
1838 unsigned long ulAddrCur = (unsigned long)pv;
1839 const size_t cPages = (offSub + cbSub) >> PAGE_SHIFT;
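 /* Note: cPages is the exclusive end index into the parent object's page
 * array rather than a count; iteration starts at offSub >> PAGE_SHIFT. */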
1840 size_t iPage;
1841
1842 LNX_MM_DOWN_WRITE(pTask->mm);
1843
1844 rc = VINF_SUCCESS;
1845 if (pMemLnxToMap->cPages)
1846 {
1847 for (iPage = offSub >> PAGE_SHIFT; iPage < cPages; iPage++, ulAddrCur += PAGE_SIZE)
1848 {
1849#if RTLNX_VER_MAX(2,6,11)
1850 RTHCPHYS Phys = page_to_phys(pMemLnxToMap->apPages[iPage]);
1851#endif
1852#if RTLNX_VER_MIN(2,6,0) || defined(HAVE_26_STYLE_REMAP_PAGE_RANGE)
1853 struct vm_area_struct *vma = find_vma(pTask->mm, ulAddrCur); /* this is probably the same for all the pages... */
1854 AssertBreakStmt(vma, rc = VERR_INTERNAL_ERROR);
1855#endif
1856#if RTLNX_VER_MAX(2,6,0) && defined(RT_ARCH_X86)
1857 /* remap_page_range() limitation on x86 */
1858 AssertBreakStmt(Phys < _4G, rc = VERR_NO_MEMORY);
1859#endif
1860
1861#if defined(VBOX_USE_INSERT_PAGE) && RTLNX_VER_MIN(2,6,22)
1862 rc = vm_insert_page(vma, ulAddrCur, pMemLnxToMap->apPages[iPage]);
1863 /* These flags help make 100% sure some bad stuff won't happen (swap, core dumps, ++).
1864 * See remap_pfn_range() in mm/memory.c */
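 /* VM_DONTEXPAND keeps mremap() from growing the area and VM_DONTDUMP keeps
 * it out of core dumps; pre-3.7 kernels expressed both via VM_RESERVED. */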
1865#if RTLNX_VER_MIN(3,7,0)
1866 vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
1867#else
1868 vma->vm_flags |= VM_RESERVED;
1869#endif
1870#elif RTLNX_VER_MIN(2,6,11)
1871 rc = remap_pfn_range(vma, ulAddrCur, page_to_pfn(pMemLnxToMap->apPages[iPage]), PAGE_SIZE, fPg);
1872#elif defined(VBOX_USE_PAE_HACK)
1873 rc = remap_page_range(vma, ulAddrCur, DummyPhys, PAGE_SIZE, fPg);
1874 if (!rc)
1875 rc = rtR0MemObjLinuxFixPte(pTask->mm, ulAddrCur, Phys);
1876#elif RTLNX_VER_MIN(2,6,0) || defined(HAVE_26_STYLE_REMAP_PAGE_RANGE)
1877 rc = remap_page_range(vma, ulAddrCur, Phys, PAGE_SIZE, fPg);
1878#else /* 2.4 */
1879 rc = remap_page_range(ulAddrCur, Phys, PAGE_SIZE, fPg);
1880#endif
1881 if (rc)
1882 {
1883 rc = VERR_NO_MEMORY;
1884 break;
1885 }
1886 }
1887 }
1888 else
1889 {
1890 RTHCPHYS Phys;
1891 if (pMemLnxToMap->Core.enmType == RTR0MEMOBJTYPE_PHYS)
1892 Phys = pMemLnxToMap->Core.u.Phys.PhysBase;
1893 else if (pMemLnxToMap->Core.enmType == RTR0MEMOBJTYPE_CONT)
1894 Phys = pMemLnxToMap->Core.u.Cont.Phys;
1895 else
1896 {
1897 AssertMsgFailed(("%d\n", pMemLnxToMap->Core.enmType));
1898 Phys = NIL_RTHCPHYS;
1899 }
1900 if (Phys != NIL_RTHCPHYS)
1901 {
1902 for (iPage = offSub >> PAGE_SHIFT; iPage < cPages; iPage++, ulAddrCur += PAGE_SIZE, Phys += PAGE_SIZE)
1903 {
1904#if RTLNX_VER_MIN(2,6,0) || defined(HAVE_26_STYLE_REMAP_PAGE_RANGE)
1905 struct vm_area_struct *vma = find_vma(pTask->mm, ulAddrCur); /* this is probably the same for all the pages... */
1906 AssertBreakStmt(vma, rc = VERR_INTERNAL_ERROR);
1907#endif
1908#if RTLNX_VER_MAX(2,6,0) && defined(RT_ARCH_X86)
1909 /* remap_page_range() limitation on x86 */
1910 AssertBreakStmt(Phys < _4G, rc = VERR_NO_MEMORY);
1911#endif
1912
1913#if RTLNX_VER_MIN(2,6,11)
1914 rc = remap_pfn_range(vma, ulAddrCur, Phys, PAGE_SIZE, fPg);
1915#elif defined(VBOX_USE_PAE_HACK)
1916 rc = remap_page_range(vma, ulAddrCur, DummyPhys, PAGE_SIZE, fPg);
1917 if (!rc)
1918 rc = rtR0MemObjLinuxFixPte(pTask->mm, ulAddrCur, Phys);
1919#elif RTLNX_VER_MIN(2,6,0) || defined(HAVE_26_STYLE_REMAP_PAGE_RANGE)
1920 rc = remap_page_range(vma, ulAddrCur, Phys, PAGE_SIZE, fPg);
1921#else /* 2.4 */
1922 rc = remap_page_range(ulAddrCur, Phys, PAGE_SIZE, fPg);
1923#endif
1924 if (rc)
1925 {
1926 rc = VERR_NO_MEMORY;
1927 break;
1928 }
1929 }
1930 }
1931 }
1932
1933#ifdef CONFIG_NUMA_BALANCING
1934# if RTLNX_VER_MAX(3,13,0) && RTLNX_RHEL_MAX(7,0)
1935# define VBOX_NUMA_HACK_OLD
1936# endif
1937 if (RT_SUCCESS(rc))
1938 {
1939 /** @todo Ugly hack! But right now we have no other means to
1940 * disable automatic NUMA page balancing. */
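 /* These fields are scan deadlines in jiffies; parking them far in the
 * future keeps task_numa_work() from periodically remapping these PTEs
 * as PROT_NONE for NUMA access sampling. */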
1941# ifdef RT_ARCH_X86
1942# ifdef VBOX_NUMA_HACK_OLD
1943 pTask->mm->numa_next_reset = jiffies + 0x7fffffffUL;
1944# endif
1945 pTask->mm->numa_next_scan = jiffies + 0x7fffffffUL;
1946# else
1947# ifdef VBOX_NUMA_HACK_OLD
1948 pTask->mm->numa_next_reset = jiffies + 0x7fffffffffffffffUL;
1949# endif
1950 pTask->mm->numa_next_scan = jiffies + 0x7fffffffffffffffUL;
1951# endif
1952 }
1953#endif /* CONFIG_NUMA_BALANCING */
1954
1955 LNX_MM_UP_WRITE(pTask->mm);
1956
1957 if (RT_SUCCESS(rc))
1958 {
1959#ifdef VBOX_USE_PAE_HACK
1960 __free_page(pDummyPage);
1961#endif
1962 pMemLnx->Core.pv = pv;
1963 pMemLnx->Core.u.Mapping.R0Process = R0Process;
1964 *ppMem = &pMemLnx->Core;
1965 IPRT_LINUX_RESTORE_EFL_AC();
1966 return VINF_SUCCESS;
1967 }
1968
1969 /*
1970 * Bail out.
1971 */
1972 rtR0MemObjLinuxDoMunmap(pv, cbSub, pTask);
1973 }
1974 rtR0MemObjDelete(&pMemLnx->Core);
1975 }
1976#ifdef VBOX_USE_PAE_HACK
1977 __free_page(pDummyPage);
1978#endif
1979
1980 IPRT_LINUX_RESTORE_EFL_AC();
1981 return rc;
1982}
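
/*
 * Usage sketch (illustrative only, not lifted from an actual caller): the
 * worker above backs the generic RTR0MemObjMapUser() API, which a ring-0
 * component would typically use like this to expose an allocation to the
 * calling process (hMemObj is a previously created memory object):
 *
 * @code
 *     RTR0MEMOBJ hMapObj;
 *     int rc = RTR0MemObjMapUser(&hMapObj, hMemObj, (RTR3PTR)-1, 0,
 *                                RTMEM_PROT_READ | RTMEM_PROT_WRITE, RTR0ProcHandleSelf());
 *     if (RT_SUCCESS(rc))
 *     {
 *         RTR3PTR R3Ptr = RTR0MemObjAddressR3(hMapObj); // hand this address to ring-3
 *         NOREF(R3Ptr);
 *     }
 * @endcode
 */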
1983
1984
1985DECLHIDDEN(int) rtR0MemObjNativeProtect(PRTR0MEMOBJINTERNAL pMem, size_t offSub, size_t cbSub, uint32_t fProt)
1986{
1987# ifdef IPRT_USE_ALLOC_VM_AREA_FOR_EXEC
1988 /*
1989 * Currently only supported when we've got the addresses of the PTEs from the kernel.
1990 */
1991 PRTR0MEMOBJLNX pMemLnx = (PRTR0MEMOBJLNX)pMem;
1992 if (pMemLnx->pArea && pMemLnx->papPtesForArea)
1993 {
1994 pgprot_t const fPg = rtR0MemObjLinuxConvertProt(fProt, true /*fKernel*/);
1995 size_t const cPages = (offSub + cbSub) >> PAGE_SHIFT;
1996 pte_t **papPtes = pMemLnx->papPtesForArea;
1997 size_t i;
1998
1999 for (i = offSub >> PAGE_SHIFT; i < cPages; i++)
2000 {
2001 set_pte(papPtes[i], mk_pte(pMemLnx->apPages[i], fPg));
2002 }
2003 preempt_disable();
2004 __flush_tlb_all();
2005 preempt_enable();
2006 return VINF_SUCCESS;
2007 }
2008# elif defined(IPRT_USE_APPLY_TO_PAGE_RANGE_FOR_EXEC)
2009 PRTR0MEMOBJLNX pMemLnx = (PRTR0MEMOBJLNX)pMem;
2010 if ( pMemLnx->fExecutable
2011 && pMemLnx->fMappedToRing0)
2012 {
2013 LNXAPPLYPGRANGE Args;
2014 Args.pMemLnx = pMemLnx;
2015 Args.fPg = rtR0MemObjLinuxConvertProt(fProt, true /*fKernel*/);
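 /* apply_to_page_range() walks the kernel page tables for the given virtual
 * range and invokes the callback once per PTE; rtR0MemObjLinuxApplyPageRange()
 * (defined earlier in this file) rewrites each PTE from the page array in
 * Args.pMemLnx using the new protection in Args.fPg. */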
2016 int rcLnx = apply_to_page_range(current->active_mm, (unsigned long)pMemLnx->Core.pv + offSub, cbSub,
2017 rtR0MemObjLinuxApplyPageRange, (void *)&Args);
2018 if (rcLnx)
2019 return VERR_NOT_SUPPORTED;
2020
2021 return VINF_SUCCESS;
2022 }
2023# endif
2024
2025 NOREF(pMem);
2026 NOREF(offSub);
2027 NOREF(cbSub);
2028 NOREF(fProt);
2029 return VERR_NOT_SUPPORTED;
2030}
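
/*
 * Usage sketch (illustrative only): the generic RTR0MemObjProtect() API lands
 * in the worker above, which is what makes a write-then-execute sequence for
 * executable allocations possible. cb and pvCode below are placeholders for
 * the caller's page-aligned code size and image bits:
 *
 * @code
 *     RTR0MEMOBJ hMemObj;
 *     int rc = RTR0MemObjAllocPage(&hMemObj, cb, true); // fExecutable = true
 *     if (RT_SUCCESS(rc))
 *     {
 *         memcpy(RTR0MemObjAddress(hMemObj), pvCode, cb);
 *         rc = RTR0MemObjProtect(hMemObj, 0, cb, RTMEM_PROT_READ | RTMEM_PROT_EXEC);
 *     }
 * @endcode
 */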
2031
2032
2033DECLHIDDEN(RTHCPHYS) rtR0MemObjNativeGetPagePhysAddr(PRTR0MEMOBJINTERNAL pMem, size_t iPage)
2034{
2035 PRTR0MEMOBJLNX pMemLnx = (PRTR0MEMOBJLNX)pMem;
2036
2037 if (pMemLnx->cPages)
2038 return page_to_phys(pMemLnx->apPages[iPage]);
2039
2040 switch (pMemLnx->Core.enmType)
2041 {
2042 case RTR0MEMOBJTYPE_CONT:
2043 return pMemLnx->Core.u.Cont.Phys + (iPage << PAGE_SHIFT);
2044
2045 case RTR0MEMOBJTYPE_PHYS:
2046 return pMemLnx->Core.u.Phys.PhysBase + (iPage << PAGE_SHIFT);
2047
2048 /* the parent knows */
2049 case RTR0MEMOBJTYPE_MAPPING:
2050 return rtR0MemObjNativeGetPagePhysAddr(pMemLnx->Core.uRel.Child.pParent, iPage);
2051
2052 /* cPages > 0 */
2053 case RTR0MEMOBJTYPE_LOW:
2054 case RTR0MEMOBJTYPE_LOCK:
2055 case RTR0MEMOBJTYPE_PHYS_NC:
2056 case RTR0MEMOBJTYPE_PAGE:
2057 case RTR0MEMOBJTYPE_LARGE_PAGE:
2058 default:
2059 AssertMsgFailed(("%d\n", pMemLnx->Core.enmType));
2060 RT_FALL_THROUGH();
2061
2062 case RTR0MEMOBJTYPE_RES_VIRT:
2063 return NIL_RTHCPHYS;
2064 }
2065}
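
/*
 * Usage sketch (illustrative only): the per-page lookup above is what the
 * generic RTR0MemObjGetPagePhysAddr() API resolves to, e.g. when a caller
 * builds a physical page list for a device (paPhysPages is a caller-provided
 * array with room for one entry per page):
 *
 * @code
 *     size_t iPage;
 *     size_t const cPages = RTR0MemObjSize(hMemObj) >> PAGE_SHIFT;
 *     for (iPage = 0; iPage < cPages; iPage++)
 *         paPhysPages[iPage] = RTR0MemObjGetPagePhysAddr(hMemObj, iPage);
 * @endcode
 */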
2066