#define JEMALLOC_PAGES_C_
#include "jemalloc/internal/jemalloc_preamble.h"

#include "jemalloc/internal/pages.h"

#include "jemalloc/internal/jemalloc_internal_includes.h"

#include "jemalloc/internal/assert.h"
#include "jemalloc/internal/malloc_io.h"

#ifdef JEMALLOC_SYSCTL_VM_OVERCOMMIT
#include <sys/sysctl.h>
#ifdef __FreeBSD__
#include <vm/vm_param.h>
#endif
#endif

/******************************************************************************/
/* Data. */

/* Actual operating system page size, detected during bootstrap, <= PAGE. */
static size_t   os_page;

#ifndef _WIN32
#  define PAGES_PROT_COMMIT (PROT_READ | PROT_WRITE)
#  define PAGES_PROT_DECOMMIT (PROT_NONE)
static int      mmap_flags;
#endif
static bool     os_overcommits;

const char *thp_mode_names[] = {
        "default",
        "always",
        "never",
        "not supported"
};
thp_mode_t opt_thp = THP_MODE_DEFAULT;
thp_mode_t init_system_thp_mode;

/* Runtime support for lazy purge. Irrelevant when !pages_can_purge_lazy. */
static bool pages_can_purge_lazy_runtime = true;

/******************************************************************************/
/*
 * Function prototypes for static functions that are referenced prior to
 * definition.
 */

static void os_pages_unmap(void *addr, size_t size);

/******************************************************************************/

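/*
 * Commit/decommit note: in this file a mapping is "committed" when it is
 * mapped readable and writable (PAGES_PROT_COMMIT), and "decommitted" when
 * it is mapped PROT_NONE, i.e. merely reserved address space.  When the OS
 * overcommits, there is no commit charge to economize on, so *commit is
 * forced to true and the distinction collapses.
 */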
static void *
os_pages_map(void *addr, size_t size, size_t alignment, bool *commit) {
        assert(ALIGNMENT_ADDR2BASE(addr, os_page) == addr);
        assert(ALIGNMENT_CEILING(size, os_page) == size);
        assert(size != 0);

        if (os_overcommits) {
                *commit = true;
        }

        void *ret;
#ifdef _WIN32
        /*
         * If an address is given and VirtualAlloc can't allocate there, it
         * fails and returns NULL.
         */
        ret = VirtualAlloc(addr, size, MEM_RESERVE | (*commit ? MEM_COMMIT : 0),
            PAGE_READWRITE);
#else
        /*
         * We don't use MAP_FIXED here, because it can cause the *replacement*
         * of existing mappings, and we only want to create new mappings.
         */
        {
                int prot = *commit ? PAGES_PROT_COMMIT : PAGES_PROT_DECOMMIT;

                ret = mmap(addr, size, prot, mmap_flags, -1, 0);
        }
        assert(ret != NULL);

        if (ret == MAP_FAILED) {
                ret = NULL;
        } else if (addr != NULL && ret != addr) {
                /*
                 * We succeeded in mapping memory, but not in the right place.
                 */
                os_pages_unmap(ret, size);
                ret = NULL;
        }
#endif
        assert(ret == NULL || (addr == NULL && ret != addr) || (addr != NULL &&
            ret == addr));
        return ret;
}

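/*
 * Shrink an existing mapping [addr, addr+alloc_size) down to the aligned
 * sub-range [addr+leadsize, addr+leadsize+size):
 *
 *     addr                                              addr + alloc_size
 *     |-- leadsize --|-------- size --------|-- trailsize --|
 *                    ^ returned pointer
 *
 * On POSIX systems the excess head and tail are simply munmap()ed in place.
 * Windows cannot release part of a VirtualAlloc() reservation, so the whole
 * mapping is released and the target sub-range is re-mapped, which can fail
 * if another thread claims the address range in the meantime.
 */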
static void *
os_pages_trim(void *addr, size_t alloc_size, size_t leadsize, size_t size,
    bool *commit) {
        void *ret = (void *)((uintptr_t)addr + leadsize);

        assert(alloc_size >= leadsize + size);
#ifdef _WIN32
        os_pages_unmap(addr, alloc_size);
        void *new_addr = os_pages_map(ret, size, PAGE, commit);
        if (new_addr == ret) {
                return ret;
        }
        if (new_addr != NULL) {
                os_pages_unmap(new_addr, size);
        }
        return NULL;
#else
        size_t trailsize = alloc_size - leadsize - size;

        if (leadsize != 0) {
                os_pages_unmap(addr, leadsize);
        }
        if (trailsize != 0) {
                os_pages_unmap((void *)((uintptr_t)ret + size), trailsize);
        }
        return ret;
#endif
}

static void
os_pages_unmap(void *addr, size_t size) {
        assert(ALIGNMENT_ADDR2BASE(addr, os_page) == addr);
        assert(ALIGNMENT_CEILING(size, os_page) == size);

#ifdef _WIN32
        if (VirtualFree(addr, 0, MEM_RELEASE) == 0)
#else
        if (munmap(addr, size) == -1)
#endif
        {
                char buf[BUFERROR_BUF];

                buferror(get_errno(), buf, sizeof(buf));
                malloc_printf("<jemalloc>: Error in "
#ifdef _WIN32
                    "VirtualFree"
#else
                    "munmap"
#endif
                    "(): %s\n", buf);
                if (opt_abort) {
                        abort();
                }
        }
}

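/*
 * Over-allocate by (alignment - os_page) so that an aligned sub-range of
 * length size is guaranteed to exist, then trim the excess.  Illustrative
 * numbers, assuming 4 KiB pages: size = 8 KiB, alignment = 16 KiB gives
 * alloc_size = 20 KiB; wherever the kernel places that 20 KiB mapping, it
 * contains a 16 KiB-aligned 8 KiB range, with leadsize <= alignment -
 * os_page.  The retry loop only matters on Windows, where os_pages_trim()
 * re-maps and can transiently fail; on POSIX systems the trim cannot fail.
 */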
static void *
pages_map_slow(size_t size, size_t alignment, bool *commit) {
        size_t alloc_size = size + alignment - os_page;
        /* Beware size_t wrap-around. */
        if (alloc_size < size) {
                return NULL;
        }

        void *ret;
        do {
                void *pages = os_pages_map(NULL, alloc_size, alignment, commit);
                if (pages == NULL) {
                        return NULL;
                }
                size_t leadsize = ALIGNMENT_CEILING((uintptr_t)pages, alignment)
                    - (uintptr_t)pages;
                ret = os_pages_trim(pages, alloc_size, leadsize, size, commit);
        } while (ret == NULL);

        assert(ret != NULL);
        assert(PAGE_ADDR2BASE(ret) == ret);
        return ret;
}

void *
pages_map(void *addr, size_t size, size_t alignment, bool *commit) {
        assert(alignment >= PAGE);
        assert(ALIGNMENT_ADDR2BASE(addr, alignment) == addr);

#if defined(__FreeBSD__) && defined(MAP_EXCL)
        /*
         * FreeBSD has mechanisms both to mmap at a specific address without
         * touching existing mappings, and to mmap with a specific alignment.
         */
        {
                if (os_overcommits) {
                        *commit = true;
                }

                int prot = *commit ? PAGES_PROT_COMMIT : PAGES_PROT_DECOMMIT;
                int flags = mmap_flags;

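                /*
                 * With a hint address, MAP_FIXED | MAP_EXCL makes mmap()
                 * fail rather than replace an existing mapping at addr.
                 * Without one, MAP_ALIGNED(n) requests 2^n-byte alignment
                 * directly: ffs_zu() returns the 1-based index of the lowest
                 * set bit, so for a power-of-two alignment,
                 * ffs_zu(alignment) - 1 == lg(alignment).
                 */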
                if (addr != NULL) {
                        flags |= MAP_FIXED | MAP_EXCL;
                } else {
                        unsigned alignment_bits = ffs_zu(alignment);
                        assert(alignment_bits > 1);
                        flags |= MAP_ALIGNED(alignment_bits - 1);
                }

                void *ret = mmap(addr, size, prot, flags, -1, 0);
                if (ret == MAP_FAILED) {
                        ret = NULL;
                }

                return ret;
        }
#endif
        /*
         * Ideally, there would be a way to specify alignment to mmap() (like
         * NetBSD has), but in the absence of such a feature, we have to work
         * hard to efficiently create aligned mappings.  The reliable but
         * slow method is to create an over-sized mapping, then trim the
         * excess.  However, that always results in one or two calls to
         * os_pages_unmap(), and it can leave holes in the process's virtual
         * memory map if memory grows downward.
         *
         * Optimistically try mapping precisely the right amount before falling
         * back to the slow method, with the expectation that the optimistic
         * approach works most of the time.
         */

        void *ret = os_pages_map(addr, size, os_page, commit);
        if (ret == NULL || ret == addr) {
                return ret;
        }
        assert(addr == NULL);
        if (ALIGNMENT_ADDR2OFFSET(ret, alignment) != 0) {
                os_pages_unmap(ret, size);
                return pages_map_slow(size, alignment, commit);
        }

        assert(PAGE_ADDR2BASE(ret) == ret);
        return ret;
}

void
pages_unmap(void *addr, size_t size) {
        assert(PAGE_ADDR2BASE(addr) == addr);
        assert(PAGE_CEILING(size) == size);

        os_pages_unmap(addr, size);
}

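/*
 * Commit or decommit by overlaying a fresh anonymous mapping with MAP_FIXED:
 * committing remaps the range PROT_READ|PROT_WRITE; decommitting remaps it
 * PROT_NONE, discarding the old pages while keeping the address range
 * reserved.  Per the convention used throughout this file, these helpers
 * return false on success and true on error.
 */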
static bool
pages_commit_impl(void *addr, size_t size, bool commit) {
        assert(PAGE_ADDR2BASE(addr) == addr);
        assert(PAGE_CEILING(size) == size);

        if (os_overcommits) {
                return true;
        }

#ifdef _WIN32
        return (commit ? (addr != VirtualAlloc(addr, size, MEM_COMMIT,
            PAGE_READWRITE)) : (!VirtualFree(addr, size, MEM_DECOMMIT)));
#else
        {
                int prot = commit ? PAGES_PROT_COMMIT : PAGES_PROT_DECOMMIT;
                void *result = mmap(addr, size, prot, mmap_flags | MAP_FIXED,
                    -1, 0);
                if (result == MAP_FAILED) {
                        return true;
                }
                if (result != addr) {
                        /*
                         * We succeeded in mapping memory, but not in the right
                         * place.
                         */
                        os_pages_unmap(result, size);
                        return true;
                }
                return false;
        }
#endif
}

bool
pages_commit(void *addr, size_t size) {
        return pages_commit_impl(addr, size, true);
}

bool
pages_decommit(void *addr, size_t size) {
        return pages_commit_impl(addr, size, false);
}

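/*
 * Lazy vs. forced purging: a lazy purge (e.g. MADV_FREE) tells the kernel
 * the contents are no longer needed, but lets it reclaim the pages only
 * under memory pressure, so the call is cheap and the memory may stay
 * resident.  A forced purge (e.g. Linux MADV_DONTNEED) drops the pages
 * immediately, and the next access is served from fresh zeroed pages.
 */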
bool
pages_purge_lazy(void *addr, size_t size) {
        assert(ALIGNMENT_ADDR2BASE(addr, os_page) == addr);
        assert(PAGE_CEILING(size) == size);

        if (!pages_can_purge_lazy) {
                return true;
        }
        if (!pages_can_purge_lazy_runtime) {
                /*
                 * Built with lazy purge enabled, but detected it was not
                 * supported on the current system.
                 */
                return true;
        }

#ifdef _WIN32
        VirtualAlloc(addr, size, MEM_RESET, PAGE_READWRITE);
        return false;
#elif defined(JEMALLOC_PURGE_MADVISE_FREE)
        return (madvise(addr, size,
#  ifdef MADV_FREE
            MADV_FREE
#  else
            JEMALLOC_MADV_FREE
#  endif
            ) != 0);
#elif defined(JEMALLOC_PURGE_MADVISE_DONTNEED) && \
    !defined(JEMALLOC_PURGE_MADVISE_DONTNEED_ZEROS)
        return (madvise(addr, size, MADV_DONTNEED) != 0);
#else
        not_reached();
#endif
}

bool
pages_purge_forced(void *addr, size_t size) {
        assert(PAGE_ADDR2BASE(addr) == addr);
        assert(PAGE_CEILING(size) == size);

        if (!pages_can_purge_forced) {
                return true;
        }

#if defined(JEMALLOC_PURGE_MADVISE_DONTNEED) && \
    defined(JEMALLOC_PURGE_MADVISE_DONTNEED_ZEROS)
        return (madvise(addr, size, MADV_DONTNEED) != 0);
#elif defined(JEMALLOC_MAPS_COALESCE)
        /* Try to overlay a new demand-zeroed mapping. */
        return pages_commit(addr, size);
#else
        not_reached();
#endif
}

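/*
 * MADV_HUGEPAGE marks a range as eligible for Linux transparent huge pages
 * (THP) and MADV_NOHUGEPAGE exempts it; the kernel honors these according
 * to the system-wide THP mode.  The aligned variants additionally assert
 * hugepage alignment, which the kernel needs in order to actually back the
 * range with huge pages.
 */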
static bool
pages_huge_impl(void *addr, size_t size, bool aligned) {
        if (aligned) {
                assert(HUGEPAGE_ADDR2BASE(addr) == addr);
                assert(HUGEPAGE_CEILING(size) == size);
        }
#ifdef JEMALLOC_HAVE_MADVISE_HUGE
        return (madvise(addr, size, MADV_HUGEPAGE) != 0);
#else
        return true;
#endif
}

bool
pages_huge(void *addr, size_t size) {
        return pages_huge_impl(addr, size, true);
}

static bool
pages_huge_unaligned(void *addr, size_t size) {
        return pages_huge_impl(addr, size, false);
}

static bool
pages_nohuge_impl(void *addr, size_t size, bool aligned) {
        if (aligned) {
                assert(HUGEPAGE_ADDR2BASE(addr) == addr);
                assert(HUGEPAGE_CEILING(size) == size);
        }

#ifdef JEMALLOC_HAVE_MADVISE_HUGE
        return (madvise(addr, size, MADV_NOHUGEPAGE) != 0);
#else
        return false;
#endif
}

bool
pages_nohuge(void *addr, size_t size) {
        return pages_nohuge_impl(addr, size, true);
}

static bool
pages_nohuge_unaligned(void *addr, size_t size) {
        return pages_nohuge_impl(addr, size, false);
}

bool
pages_dontdump(void *addr, size_t size) {
        assert(PAGE_ADDR2BASE(addr) == addr);
        assert(PAGE_CEILING(size) == size);
#ifdef JEMALLOC_MADVISE_DONTDUMP
        return madvise(addr, size, MADV_DONTDUMP) != 0;
#else
        return false;
#endif
}

bool
pages_dodump(void *addr, size_t size) {
        assert(PAGE_ADDR2BASE(addr) == addr);
        assert(PAGE_CEILING(size) == size);
#ifdef JEMALLOC_MADVISE_DONTDUMP
        return madvise(addr, size, MADV_DODUMP) != 0;
#else
        return false;
#endif
}

static size_t
os_page_detect(void) {
#ifdef _WIN32
        SYSTEM_INFO si;
        GetSystemInfo(&si);
        return si.dwPageSize;
#elif defined(__FreeBSD__)
        /*
         * This returns the value obtained from the auxv vector, avoiding a
         * syscall.
         */
        return getpagesize();
#else
        long result = sysconf(_SC_PAGESIZE);
        if (result == -1) {
                /* sysconf() failed; fall back to the compile-time page size. */
                return PAGE;
        }
        return (size_t)result;
#endif
}

#ifdef JEMALLOC_SYSCTL_VM_OVERCOMMIT
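/*
 * FreeBSD's vm.overcommit sysctl is a bitmask; the low two bits
 * (SWAP_RESERVE_FORCE_ON and SWAP_RESERVE_RLIMIT_ON in <vm/vm_param.h>)
 * each enable a form of strict swap accounting.  If neither is set, the
 * kernel overcommits.
 */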
static bool
os_overcommits_sysctl(void) {
        int vm_overcommit;
        size_t sz;

        sz = sizeof(vm_overcommit);
#if defined(__FreeBSD__) && defined(VM_OVERCOMMIT)
        int mib[2];

        mib[0] = CTL_VM;
        mib[1] = VM_OVERCOMMIT;
        if (sysctl(mib, 2, &vm_overcommit, &sz, NULL, 0) != 0) {
                return false; /* Error. */
        }
#else
        if (sysctlbyname("vm.overcommit", &vm_overcommit, &sz, NULL, 0) != 0) {
                return false; /* Error. */
        }
#endif

        return ((vm_overcommit & 0x3) == 0);
}
#endif

#ifdef JEMALLOC_PROC_SYS_VM_OVERCOMMIT_MEMORY
/*
 * Use syscall(2) rather than {open,read,close}(2) when possible to avoid
 * reentry during bootstrapping if another library has interposed system call
 * wrappers.
 */
static bool
os_overcommits_proc(void) {
        int fd;
        char buf[1];

#if defined(JEMALLOC_USE_SYSCALL) && defined(SYS_open)
        #if defined(O_CLOEXEC)
                fd = (int)syscall(SYS_open, "/proc/sys/vm/overcommit_memory", O_RDONLY |
                        O_CLOEXEC);
        #else
                fd = (int)syscall(SYS_open, "/proc/sys/vm/overcommit_memory", O_RDONLY);
                if (fd != -1) {
                        fcntl(fd, F_SETFD, fcntl(fd, F_GETFD) | FD_CLOEXEC);
                }
        #endif
#elif defined(JEMALLOC_USE_SYSCALL) && defined(SYS_openat)
        #if defined(O_CLOEXEC)
                fd = (int)syscall(SYS_openat,
                        AT_FDCWD, "/proc/sys/vm/overcommit_memory", O_RDONLY | O_CLOEXEC);
        #else
                fd = (int)syscall(SYS_openat,
                        AT_FDCWD, "/proc/sys/vm/overcommit_memory", O_RDONLY);
                if (fd != -1) {
                        fcntl(fd, F_SETFD, fcntl(fd, F_GETFD) | FD_CLOEXEC);
                }
        #endif
#else
        #if defined(O_CLOEXEC)
                fd = open("/proc/sys/vm/overcommit_memory", O_RDONLY | O_CLOEXEC);
        #else
                fd = open("/proc/sys/vm/overcommit_memory", O_RDONLY);
                if (fd != -1) {
                        fcntl(fd, F_SETFD, fcntl(fd, F_GETFD) | FD_CLOEXEC);
                }
        #endif
#endif

        if (fd == -1) {
                return false; /* Error. */
        }

        ssize_t nread = malloc_read_fd(fd, &buf, sizeof(buf));
#if defined(JEMALLOC_USE_SYSCALL) && defined(SYS_close)
        syscall(SYS_close, fd);
#else
        close(fd);
#endif

        if (nread < 1) {
                return false; /* Error. */
        }
        /*
         * /proc/sys/vm/overcommit_memory meanings:
         * 0: Heuristic overcommit.
         * 1: Always overcommit.
         * 2: Never overcommit.
         */
        return (buf[0] == '0' || buf[0] == '1');
}
#endif

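/*
 * Apply the opt.thp setting to a newly mapped range when it differs from
 * the system-wide THP mode recorded at boot.  Nothing needs to be done when
 * they match (or when opt.thp is "default"), since the kernel's behavior is
 * already the desired one.
 */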
void
pages_set_thp_state(void *ptr, size_t size) {
        if (opt_thp == thp_mode_default || opt_thp == init_system_thp_mode) {
                return;
        }
        assert(opt_thp != thp_mode_not_supported &&
            init_system_thp_mode != thp_mode_not_supported);

        if (opt_thp == thp_mode_always
            && init_system_thp_mode != thp_mode_never) {
                assert(init_system_thp_mode == thp_mode_default);
                pages_huge_unaligned(ptr, size);
        } else if (opt_thp == thp_mode_never) {
                assert(init_system_thp_mode == thp_mode_default ||
                    init_system_thp_mode == thp_mode_always);
                pages_nohuge_unaligned(ptr, size);
        }
}

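/*
 * Determine the system THP mode by reading
 * /sys/kernel/mm/transparent_hugepage/enabled, which reports the active
 * mode in brackets, e.g. "always [madvise] never".
 */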
static void
init_thp_state(void) {
        if (!have_madvise_huge) {
                if (metadata_thp_enabled() && opt_abort) {
                        malloc_write("<jemalloc>: no MADV_HUGEPAGE support\n");
                        abort();
                }
                goto label_error;
        }

        static const char sys_state_madvise[] = "always [madvise] never\n";
        static const char sys_state_always[] = "[always] madvise never\n";
        static const char sys_state_never[] = "always madvise [never]\n";
        char buf[sizeof(sys_state_madvise)];

#if defined(JEMALLOC_USE_SYSCALL) && defined(SYS_open)
        int fd = (int)syscall(SYS_open,
            "/sys/kernel/mm/transparent_hugepage/enabled", O_RDONLY);
#else
        int fd = open("/sys/kernel/mm/transparent_hugepage/enabled", O_RDONLY);
#endif
        if (fd == -1) {
                goto label_error;
        }

        ssize_t nread = malloc_read_fd(fd, &buf, sizeof(buf));
#if defined(JEMALLOC_USE_SYSCALL) && defined(SYS_close)
        syscall(SYS_close, fd);
#else
        close(fd);
#endif

        if (nread < 0) {
                goto label_error;
        }

        if (strncmp(buf, sys_state_madvise, (size_t)nread) == 0) {
                init_system_thp_mode = thp_mode_default;
        } else if (strncmp(buf, sys_state_always, (size_t)nread) == 0) {
                init_system_thp_mode = thp_mode_always;
        } else if (strncmp(buf, sys_state_never, (size_t)nread) == 0) {
                init_system_thp_mode = thp_mode_never;
        } else {
                goto label_error;
        }
        return;
label_error:
        opt_thp = init_system_thp_mode = thp_mode_not_supported;
}

bool
pages_boot(void) {
        os_page = os_page_detect();
        if (os_page > PAGE) {
                malloc_write("<jemalloc>: Unsupported system page size\n");
                if (opt_abort) {
                        abort();
                }
                return true;
        }

#ifndef _WIN32
        mmap_flags = MAP_PRIVATE | MAP_ANON;
#endif

#ifdef JEMALLOC_SYSCTL_VM_OVERCOMMIT
        os_overcommits = os_overcommits_sysctl();
#elif defined(JEMALLOC_PROC_SYS_VM_OVERCOMMIT_MEMORY)
        os_overcommits = os_overcommits_proc();
#  ifdef MAP_NORESERVE
        if (os_overcommits) {
                mmap_flags |= MAP_NORESERVE;
        }
#  endif
#else
        os_overcommits = false;
#endif

        init_thp_state();

#ifdef __FreeBSD__
        /*
         * FreeBSD doesn't need the check; madvise(2) is known to work.
         */
#else
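        /*
         * The build-time check only proves the headers define MADV_FREE (or
         * an equivalent); a kernel older than the headers (e.g. Linux < 4.5
         * for MADV_FREE) can still reject it at run time with EINVAL.  Probe
         * a scratch page once at boot so that pages_purge_lazy() can bail
         * out early on such systems.
         */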
        /* Detect lazy purge runtime support. */
        if (pages_can_purge_lazy) {
                bool committed = false;
                void *madv_free_page = os_pages_map(NULL, PAGE, PAGE, &committed);
                if (madv_free_page == NULL) {
                        return true;
                }
                assert(pages_can_purge_lazy_runtime);
                if (pages_purge_lazy(madv_free_page, PAGE)) {
                        pages_can_purge_lazy_runtime = false;
                }
                os_pages_unmap(madv_free_page, PAGE);
        }
#endif

        return false;
}