Skip to content

Commit c8d78c1

Browse files
kiryltorvalds
authored and committed
mm: replace remap_file_pages() syscall with emulation
remap_file_pages(2) was invented to be able to efficiently map parts of a huge file into limited 32-bit virtual address space, such as in database workloads. Nonlinear mappings are a pain to support and it seems there are no legitimate use-cases nowadays since 64-bit systems are widely available. Let's drop it and get rid of all this special-cased code. The patch replaces the syscall with an emulation which creates a new VMA on each remap_file_pages() call, unless it can be merged with an adjacent one. I didn't find *any* real code that uses remap_file_pages(2) to test the emulation impact on. I've checked Debian code search and the source of all packages in ALT Linux. No real users: libc wrappers, mentions in strace, gdb, valgrind and this kind of stuff. There are a few basic tests in LTP for the syscall. They work just fine with the emulation. To test the performance impact, I've written a small test case which demonstrates pretty much the worst-case scenario: map a 4G shmfs file, write the page's pgoff to the beginning of every page, remap the pages in reverse order, read every page. The test creates 1 million VMAs if emulation is in use, so I had to set vm.max_map_count to 1100000 to avoid -ENOMEM. Before: 23.3 ( +- 4.31% ) seconds After: 43.9 ( +- 0.85% ) seconds Slowdown: 1.88x I believe we can live with that.
Test case: #define _GNU_SOURCE #include <assert.h> #include <stdlib.h> #include <stdio.h> #include <sys/mman.h> #define MB (1024UL * 1024) #define SIZE (4096 * MB) int main(int argc, char **argv) { unsigned long *p; long i, pass; for (pass = 0; pass < 10; pass++) { p = mmap(NULL, SIZE, PROT_READ|PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0); if (p == MAP_FAILED) { perror("mmap"); return -1; } for (i = 0; i < SIZE / 4096; i++) p[i * 4096 / sizeof(*p)] = i; for (i = 0; i < SIZE / 4096; i++) { if (remap_file_pages(p + i * 4096 / sizeof(*p), 4096, 0, (SIZE - 4096 * (i + 1)) >> 12, 0)) { perror("remap_file_pages"); return -1; } } for (i = SIZE / 4096 - 1; i >= 0; i--) assert(p[i * 4096 / sizeof(*p)] == SIZE / 4096 - i - 1); munmap(p, SIZE); } return 0; } [[email protected]: fix spello] [[email protected]: initialize populate before usage] [[email protected]: grab file ref to prevent race while mmaping] Signed-off-by: "Kirill A. Shutemov" <[email protected]> Cc: Peter Zijlstra <[email protected]> Cc: Ingo Molnar <[email protected]> Cc: Dave Jones <[email protected]> Cc: Linus Torvalds <[email protected]> Cc: Armin Rigo <[email protected]> Signed-off-by: Sasha Levin <[email protected]> Cc: Hugh Dickins <[email protected]> Signed-off-by: Andrew Morton <[email protected]> Signed-off-by: Linus Torvalds <[email protected]>
1 parent 3c48687 commit c8d78c1

File tree

6 files changed

+79
-298
lines changed

6 files changed

+79
-298
lines changed

Documentation/vm/remap_file_pages.txt

+3-4
Original file line numberDiff line numberDiff line change
@@ -18,10 +18,9 @@ on 32-bit systems to map files bigger than can linearly fit into 32-bit
1818
virtual address space. This use-case is not critical anymore since 64-bit
1919
systems are widely available.
2020

21-
The plan is to deprecate the syscall and replace it with an emulation.
22-
The emulation will create new VMAs instead of nonlinear mappings. It's
23-
going to work slower for rare users of remap_file_pages() but ABI is
24-
preserved.
21+
The syscall is deprecated and replaced with an emulation now. The
22+
emulation creates new VMAs instead of nonlinear mappings. It's going to
23+
work slower for rare users of remap_file_pages() but ABI is preserved.
2524

2625
One side effect of emulation (apart from performance) is that user can hit
2726
vm.max_map_count limit more easily due to additional VMAs. See comment for

include/linux/fs.h

+6-2
Original file line numberDiff line numberDiff line change
@@ -2481,8 +2481,12 @@ extern int sb_min_blocksize(struct super_block *, int);
24812481

24822482
extern int generic_file_mmap(struct file *, struct vm_area_struct *);
24832483
extern int generic_file_readonly_mmap(struct file *, struct vm_area_struct *);
2484-
extern int generic_file_remap_pages(struct vm_area_struct *, unsigned long addr,
2485-
unsigned long size, pgoff_t pgoff);
2484+
static inline int generic_file_remap_pages(struct vm_area_struct *vma,
2485+
unsigned long addr, unsigned long size, pgoff_t pgoff)
2486+
{
2487+
BUG();
2488+
return 0;
2489+
}
24862490
int generic_write_checks(struct file *file, loff_t *pos, size_t *count, int isblk);
24872491
extern ssize_t generic_file_read_iter(struct kiocb *, struct iov_iter *);
24882492
extern ssize_t __generic_file_write_iter(struct kiocb *, struct iov_iter *);

mm/Makefile

+1-1
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
#
44

55
mmu-y := nommu.o
6-
mmu-$(CONFIG_MMU) := fremap.o gup.o highmem.o memory.o mincore.o \
6+
mmu-$(CONFIG_MMU) := gup.o highmem.o memory.o mincore.o \
77
mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \
88
vmalloc.o pagewalk.o pgtable-generic.o
99

mm/fremap.c

-283
This file was deleted.

mm/mmap.c

+69
Original file line numberDiff line numberDiff line change
@@ -2634,6 +2634,75 @@ SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len)
26342634
return vm_munmap(addr, len);
26352635
}
26362636

2637+
2638+
/*
2639+
* Emulation of deprecated remap_file_pages() syscall.
2640+
*/
2641+
SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
2642+
unsigned long, prot, unsigned long, pgoff, unsigned long, flags)
2643+
{
2644+
2645+
struct mm_struct *mm = current->mm;
2646+
struct vm_area_struct *vma;
2647+
unsigned long populate = 0;
2648+
unsigned long ret = -EINVAL;
2649+
struct file *file;
2650+
2651+
pr_warn_once("%s (%d) uses deprecated remap_file_pages() syscall. "
2652+
"See Documentation/vm/remap_file_pages.txt.\n",
2653+
current->comm, current->pid);
2654+
2655+
if (prot)
2656+
return ret;
2657+
start = start & PAGE_MASK;
2658+
size = size & PAGE_MASK;
2659+
2660+
if (start + size <= start)
2661+
return ret;
2662+
2663+
/* Does pgoff wrap? */
2664+
if (pgoff + (size >> PAGE_SHIFT) < pgoff)
2665+
return ret;
2666+
2667+
down_write(&mm->mmap_sem);
2668+
vma = find_vma(mm, start);
2669+
2670+
if (!vma || !(vma->vm_flags & VM_SHARED))
2671+
goto out;
2672+
2673+
if (start < vma->vm_start || start + size > vma->vm_end)
2674+
goto out;
2675+
2676+
if (pgoff == linear_page_index(vma, start)) {
2677+
ret = 0;
2678+
goto out;
2679+
}
2680+
2681+
prot |= vma->vm_flags & VM_READ ? PROT_READ : 0;
2682+
prot |= vma->vm_flags & VM_WRITE ? PROT_WRITE : 0;
2683+
prot |= vma->vm_flags & VM_EXEC ? PROT_EXEC : 0;
2684+
2685+
flags &= MAP_NONBLOCK;
2686+
flags |= MAP_SHARED | MAP_FIXED | MAP_POPULATE;
2687+
if (vma->vm_flags & VM_LOCKED) {
2688+
flags |= MAP_LOCKED;
2689+
/* drop PG_Mlocked flag for over-mapped range */
2690+
munlock_vma_pages_range(vma, start, start + size);
2691+
}
2692+
2693+
file = get_file(vma->vm_file);
2694+
ret = do_mmap_pgoff(vma->vm_file, start, size,
2695+
prot, flags, pgoff, &populate);
2696+
fput(file);
2697+
out:
2698+
up_write(&mm->mmap_sem);
2699+
if (populate)
2700+
mm_populate(ret, populate);
2701+
if (!IS_ERR_VALUE(ret))
2702+
ret = 0;
2703+
return ret;
2704+
}
2705+
26372706
static inline void verify_mm_writelocked(struct mm_struct *mm)
26382707
{
26392708
#ifdef CONFIG_DEBUG_VM

0 commit comments

Comments
 (0)