module/os/linux/spl/spl-kmem.c

   1 /*
   2  *  Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
   3  *  Copyright (C) 2007 The Regents of the University of California.
   4  *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
   5  *  Written by Brian Behlendorf <behlendorf1@llnl.gov>.
   6  *  UCRL-CODE-235197
   7  *
   8  *  This file is part of the SPL, Solaris Porting Layer.
   9  *
  10  *  The SPL is free software; you can redistribute it and/or modify it
  11  *  under the terms of the GNU General Public License as published by the
  12  *  Free Software Foundation; either version 2 of the License, or (at your
  13  *  option) any later version.
  14  *
  15  *  The SPL is distributed in the hope that it will be useful, but WITHOUT
  16  *  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  17  *  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  18  *  for more details.
  19  *
  20  *  You should have received a copy of the GNU General Public License along
  21  *  with the SPL.  If not, see <http://www.gnu.org/licenses/>.
  22  */
  23
  24 #include <sys/debug.h>
  25 #include <sys/sysmacros.h>
  26 #include <sys/kmem.h>
  27 #include <sys/vmem.h>
  28
  29 /*
  30  * As a general rule kmem_alloc() allocations should be small, preferably
  31  * just a few pages since they must by physically contiguous.  Therefore, a
  32  * rate limited warning will be printed to the console for any kmem_alloc()
  33  * which exceeds a reasonable threshold.
  34  *
  35  * The default warning threshold is set to sixteen pages but capped at 64K to
  36  * accommodate systems using large pages.  This value was selected to be small
  37  * enough to ensure the largest allocations are quickly noticed and fixed.
  38  * But large enough to avoid logging any warnings when a allocation size is
  39  * larger than optimal but not a serious concern.  Since this value is tunable,
  40  * developers are encouraged to set it lower when testing so any new largish
  41  * allocations are quickly caught.  These warnings may be disabled by setting
  42  * the threshold to zero.
  43  */
  44 /* BEGIN CSTYLED */
  45 unsigned int spl_kmem_alloc_warn = MIN(16 * PAGE_SIZE, 64 * 1024);
  46 module_param(spl_kmem_alloc_warn, uint, 0644);
  47 MODULE_PARM_DESC(spl_kmem_alloc_warn,
  48         "Warning threshold in bytes for a kmem_alloc()");
  49 EXPORT_SYMBOL(spl_kmem_alloc_warn);
  50
  51 /*
  52  * Large kmem_alloc() allocations will fail if they exceed KMALLOC_MAX_SIZE.
  53  * Allocations which are marginally smaller than this limit may succeed but
  54  * should still be avoided due to the expense of locating a contiguous range
  55  * of free pages.  Therefore, a maximum kmem size with reasonable safely
  56  * margin of 4x is set.  Kmem_alloc() allocations larger than this maximum
  57  * will quickly fail.  Vmem_alloc() allocations less than or equal to this
  58  * value will use kmalloc(), but shift to vmalloc() when exceeding this value.
  59  */
  60 unsigned int spl_kmem_alloc_max = (KMALLOC_MAX_SIZE >> 2);
  61 module_param(spl_kmem_alloc_max, uint, 0644);
  62 MODULE_PARM_DESC(spl_kmem_alloc_max,
  63         "Maximum size in bytes for a kmem_alloc()");
  64 EXPORT_SYMBOL(spl_kmem_alloc_max);
  65 /* END CSTYLED */
  66
  67 int
  68 kmem_debugging(void)
  69 {
  70         return (0);
  71 }
  72 EXPORT_SYMBOL(kmem_debugging);
  73
  74 char *
  75 kmem_vasprintf(const char *fmt, va_list ap)
  76 {
  77         va_list aq;
  78         char *ptr;
  79
  80         do {
  81                 va_copy(aq, ap);
  82                 ptr = kvasprintf(kmem_flags_convert(KM_SLEEP), fmt, aq);
  83                 va_end(aq);
  84         } while (ptr == NULL);
  85
  86         return (ptr);
  87 }
  88 EXPORT_SYMBOL(kmem_vasprintf);
  89
  90 char *
  91 kmem_asprintf(const char *fmt, ...)
  92 {
  93         va_list ap;
  94         char *ptr;
  95
  96         do {
  97                 va_start(ap, fmt);
  98                 ptr = kvasprintf(kmem_flags_convert(KM_SLEEP), fmt, ap);
  99                 va_end(ap);
 100         } while (ptr == NULL);
 101
 102         return (ptr);
 103 }
 104 EXPORT_SYMBOL(kmem_asprintf);
 105
 106 static char *
 107 __strdup(const char *str, int flags)
 108 {
 109         char *ptr;
 110         int n;
 111
 112         n = strlen(str);
 113         ptr = kmalloc(n + 1, kmem_flags_convert(flags));
 114         if (ptr)
 115                 memcpy(ptr, str, n + 1);
 116
 117         return (ptr);
 118 }
 119
 120 char *
 121 kmem_strdup(const char *str)
 122 {
 123         return (__strdup(str, KM_SLEEP));
 124 }
 125 EXPORT_SYMBOL(kmem_strdup);
 126
 127 void
 128 kmem_strfree(char *str)
 129 {
 130         kfree(str);
 131 }
 132 EXPORT_SYMBOL(kmem_strfree);
 133
 134 void *
 135 spl_kvmalloc(size_t size, gfp_t lflags)
 136 {
 137 #ifdef HAVE_KVMALLOC
 138         /*
 139          * GFP_KERNEL allocations can safely use kvmalloc which may
 140          * improve performance by avoiding a) high latency caused by
 141          * vmalloc's on-access allocation, b) performance loss due to
 142          * MMU memory address mapping and c) vmalloc locking overhead.
 143          * This has the side-effect that the slab statistics will
 144          * incorrectly report this as a vmem allocation, but that is
 145          * purely cosmetic.
 146          */
 147         if ((lflags & GFP_KERNEL) == GFP_KERNEL)
 148                 return (kvmalloc(size, lflags));
 149 #endif
 150
 151         gfp_t kmalloc_lflags = lflags;
 152
 153         if (size > PAGE_SIZE) {
 154                 /*
 155                  * We need to set __GFP_NOWARN here since spl_kvmalloc is not
 156                  * only called by spl_kmem_alloc_impl but can be called
 157                  * directly with custom lflags, too. In that case
 158                  * kmem_flags_convert does not get called, which would
 159                  * implicitly set __GFP_NOWARN.
 160                  */
 161                 kmalloc_lflags |= __GFP_NOWARN;
 162
 163                 /*
 164                  * N.B. __GFP_RETRY_MAYFAIL is supported only for large
 165                  * e (>32kB) allocations.
 166                  *
 167                  * We have to override __GFP_RETRY_MAYFAIL by __GFP_NORETRY
 168                  * for !costly requests because there is no other way to tell
 169                  * the allocator that we want to fail rather than retry
 170                  * endlessly.
 171                  */
 172                 if (!(kmalloc_lflags & __GFP_RETRY_MAYFAIL) ||
 173                     (size <= PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) {
 174                         kmalloc_lflags |= __GFP_NORETRY;
 175                 }
 176         }
 177
 178         /*
 179          * We first try kmalloc - even for big sizes - and fall back to
 180          * spl_vmalloc if that fails.
 181          *
 182          * For non-__GFP-RECLAIM allocations we always stick to
 183          * kmalloc_node, and fail when kmalloc is not successful (returns
 184          * NULL).
 185          * We cannot fall back to spl_vmalloc in this case because spl_vmalloc
 186          * internally uses GPF_KERNEL allocations.
 187          */
 188         void *ptr = kmalloc_node(size, kmalloc_lflags, NUMA_NO_NODE);
 189         if (ptr || size <= PAGE_SIZE ||
 190             (lflags & __GFP_RECLAIM) != __GFP_RECLAIM) {
 191                 return (ptr);
 192         }
 193
 194         return (spl_vmalloc(size, lflags | __GFP_HIGHMEM));
 195 }
 196
 197 /*
 198  * General purpose unified implementation of kmem_alloc(). It is an
 199  * amalgamation of Linux and Illumos allocator design. It should never be
 200  * exported to ensure that code using kmem_alloc()/kmem_zalloc() remains
 201  * relatively portable.  Consumers may only access this function through
 202  * wrappers that enforce the common flags to ensure portability.
 203  */
 204 inline void *
 205 spl_kmem_alloc_impl(size_t size, int flags, int node)
 206 {
 207         gfp_t lflags = kmem_flags_convert(flags);
 208         void *ptr;
 209
 210         /*
 211          * Log abnormally large allocations and rate limit the console output.
 212          * Allocations larger than spl_kmem_alloc_warn should be performed
 213          * through the vmem_alloc()/vmem_zalloc() interfaces.
 214          */
 215         if ((spl_kmem_alloc_warn > 0) && (size > spl_kmem_alloc_warn) &&
 216             !(flags & KM_VMEM)) {
 217                 printk(KERN_WARNING
 218                     "Large kmem_alloc(%lu, 0x%x), please file an issue at:\n"
 219                     "https://github.com/openzfs/zfs/issues/new\n",
 220                     (unsigned long)size, flags);
 221                 dump_stack();
 222         }
 223
 224         /*
 225          * Use a loop because kmalloc_node() can fail when GFP_KERNEL is used
 226          * unlike kmem_alloc() with KM_SLEEP on Illumos.
 227          */
 228         do {
 229                 /*
 230                  * Calling kmalloc_node() when the size >= spl_kmem_alloc_max
 231                  * is unsafe.  This must fail for all for kmem_alloc() and
 232                  * kmem_zalloc() callers.
 233                  *
 234                  * For vmem_alloc() and vmem_zalloc() callers it is permissible
 235                  * to use spl_vmalloc().  However, in general use of
 236                  * spl_vmalloc() is strongly discouraged because a global lock
 237                  * must be acquired.  Contention on this lock can significantly
 238                  * impact performance so frequently manipulating the virtual
 239                  * address space is strongly discouraged.
 240                  */
 241                 if (size > spl_kmem_alloc_max) {
 242                         if (flags & KM_VMEM) {
 243                                 ptr = spl_vmalloc(size, lflags | __GFP_HIGHMEM);
 244                         } else {
 245                                 return (NULL);
 246                         }
 247                 } else {
 248                         if (flags & KM_VMEM) {
 249                                 ptr = spl_kvmalloc(size, lflags);
 250                         } else {
 251                                 ptr = kmalloc_node(size, lflags, node);
 252                         }
 253                 }
 254
 255                 if (likely(ptr) || (flags & KM_NOSLEEP))
 256                         return (ptr);
 257
 258                 /*
 259                  * Try hard to satisfy the allocation. However, when progress
 260                  * cannot be made, the allocation is allowed to fail.
 261                  */
 262                 if ((lflags & GFP_KERNEL) == GFP_KERNEL)
 263                         lflags |= __GFP_RETRY_MAYFAIL;
 264
 265                 /*
 266                  * Use cond_resched() instead of congestion_wait() to avoid
 267                  * deadlocking systems where there are no block devices.
 268                  */
 269                 cond_resched();
 270         } while (1);
 271
 272         return (NULL);
 273 }
 274
 275 inline void
 276 spl_kmem_free_impl(const void *buf, size_t size)
 277 {
 278         if (is_vmalloc_addr(buf))
 279                 vfree(buf);
 280         else
 281                 kfree(buf);
 282 }
 283
/*
 * Memory allocation and accounting for kmem_*() style allocations.  When
 * DEBUG_KMEM is enabled the total memory allocated will be tracked and
 * any memory leaked will be reported during module unload.
 *
 * ./configure --enable-debug-kmem
 */
 291 #ifdef DEBUG_KMEM
 292
 293 /* Shim layer memory accounting */
 294 #ifdef HAVE_ATOMIC64_T
 295 atomic64_t kmem_alloc_used = ATOMIC64_INIT(0);
 296 unsigned long long kmem_alloc_max = 0;
 297 #else  /* HAVE_ATOMIC64_T */
 298 atomic_t kmem_alloc_used = ATOMIC_INIT(0);
 299 unsigned long long kmem_alloc_max = 0;
 300 #endif /* HAVE_ATOMIC64_T */
 301
 302 EXPORT_SYMBOL(kmem_alloc_used);
 303 EXPORT_SYMBOL(kmem_alloc_max);
 304
 305 inline void *
 306 spl_kmem_alloc_debug(size_t size, int flags, int node)
 307 {
 308         void *ptr;
 309
 310         ptr = spl_kmem_alloc_impl(size, flags, node);
 311         if (ptr) {
 312                 kmem_alloc_used_add(size);
 313                 if (unlikely(kmem_alloc_used_read() > kmem_alloc_max))
 314                         kmem_alloc_max = kmem_alloc_used_read();
 315         }
 316
 317         return (ptr);
 318 }
 319
 320 inline void
 321 spl_kmem_free_debug(const void *ptr, size_t size)
 322 {
 323         kmem_alloc_used_sub(size);
 324         spl_kmem_free_impl(ptr, size);
 325 }
 326
/*
 * When DEBUG_KMEM_TRACKING is enabled not only will total bytes be tracked
 * but also the location of every alloc and free.  When the SPL module is
 * unloaded a list of all leaked addresses and where they were allocated
 * will be dumped to the console.  Enabling this feature has a significant
 * impact on performance but it makes finding memory leaks straightforward.
 *
 * Not surprisingly with debugging enabled the xmem_locks are very highly
 * contended particularly on xfree().  If we want to run with this detailed
 * debugging enabled for anything other than debugging we need to minimize
 * the contention by moving to a lock per xmem_table entry model.
 *
 * ./configure --enable-debug-kmem-tracking
 */
 341 #ifdef DEBUG_KMEM_TRACKING
 342
 343 #include <linux/hash.h>
 344 #include <linux/ctype.h>
 345
 346 #define KMEM_HASH_BITS          10
 347 #define KMEM_TABLE_SIZE         (1 << KMEM_HASH_BITS)
 348
 349 typedef struct kmem_debug {
 350         struct hlist_node kd_hlist;     /* Hash node linkage */
 351         struct list_head kd_list;       /* List of all allocations */
 352         void *kd_addr;                  /* Allocation pointer */
 353         size_t kd_size;                 /* Allocation size */
 354         const char *kd_func;            /* Allocation function */
 355         int kd_line;                    /* Allocation line */
 356 } kmem_debug_t;
 357
 358 static spinlock_t kmem_lock;
 359 static struct hlist_head kmem_table[KMEM_TABLE_SIZE];
 360 static struct list_head kmem_list;
 361
 362 static kmem_debug_t *
 363 kmem_del_init(spinlock_t *lock, struct hlist_head *table,
 364     int bits, const void *addr)
 365 {
 366         struct hlist_head *head;
 367         struct hlist_node *node = NULL;
 368         struct kmem_debug *p;
 369         unsigned long flags;
 370
 371         spin_lock_irqsave(lock, flags);
 372
 373         head = &table[hash_ptr((void *)addr, bits)];
 374         hlist_for_each(node, head) {
 375                 p = list_entry(node, struct kmem_debug, kd_hlist);
 376                 if (p->kd_addr == addr) {
 377                         hlist_del_init(&p->kd_hlist);
 378                         list_del_init(&p->kd_list);
 379                         spin_unlock_irqrestore(lock, flags);
 380                         return (p);
 381                 }
 382         }
 383
 384         spin_unlock_irqrestore(lock, flags);
 385
 386         return (NULL);
 387 }
 388
 389 inline void *
 390 spl_kmem_alloc_track(size_t size, int flags,
 391     const char *func, int line, int node)
 392 {
 393         void *ptr = NULL;
 394         kmem_debug_t *dptr;
 395         unsigned long irq_flags;
 396
 397         dptr = kmalloc(sizeof (kmem_debug_t), kmem_flags_convert(flags));
 398         if (dptr == NULL)
 399                 return (NULL);
 400
 401         dptr->kd_func = __strdup(func, flags);
 402         if (dptr->kd_func == NULL) {
 403                 kfree(dptr);
 404                 return (NULL);
 405         }
 406
 407         ptr = spl_kmem_alloc_debug(size, flags, node);
 408         if (ptr == NULL) {
 409                 kfree(dptr->kd_func);
 410                 kfree(dptr);
 411                 return (NULL);
 412         }
 413
 414         INIT_HLIST_NODE(&dptr->kd_hlist);
 415         INIT_LIST_HEAD(&dptr->kd_list);
 416
 417         dptr->kd_addr = ptr;
 418         dptr->kd_size = size;
 419         dptr->kd_line = line;
 420
 421         spin_lock_irqsave(&kmem_lock, irq_flags);
 422         hlist_add_head(&dptr->kd_hlist,
 423             &kmem_table[hash_ptr(ptr, KMEM_HASH_BITS)]);
 424         list_add_tail(&dptr->kd_list, &kmem_list);
 425         spin_unlock_irqrestore(&kmem_lock, irq_flags);
 426
 427         return (ptr);
 428 }
 429
 430 inline void
 431 spl_kmem_free_track(const void *ptr, size_t size)
 432 {
 433         kmem_debug_t *dptr;
 434
 435         /* Ignore NULL pointer since we haven't tracked it at all */
 436         if (ptr == NULL)
 437                 return;
 438
 439         /* Must exist in hash due to kmem_alloc() */
 440         dptr = kmem_del_init(&kmem_lock, kmem_table, KMEM_HASH_BITS, ptr);
 441         ASSERT3P(dptr, !=, NULL);
 442         ASSERT3S(dptr->kd_size, ==, size);
 443
 444         kfree(dptr->kd_func);
 445         kfree(dptr);
 446
 447         spl_kmem_free_debug(ptr, size);
 448 }
 449 #endif /* DEBUG_KMEM_TRACKING */
 450 #endif /* DEBUG_KMEM */
 451
 452 /*
 453  * Public kmem_alloc(), kmem_zalloc() and kmem_free() interfaces.
 454  */
 455 void *
 456 spl_kmem_alloc(size_t size, int flags, const char *func, int line)
 457 {
 458         ASSERT0(flags & ~KM_PUBLIC_MASK);
 459
 460 #if !defined(DEBUG_KMEM)
 461         return (spl_kmem_alloc_impl(size, flags, NUMA_NO_NODE));
 462 #elif !defined(DEBUG_KMEM_TRACKING)
 463         return (spl_kmem_alloc_debug(size, flags, NUMA_NO_NODE));
 464 #else
 465         return (spl_kmem_alloc_track(size, flags, func, line, NUMA_NO_NODE));
 466 #endif
 467 }
 468 EXPORT_SYMBOL(spl_kmem_alloc);
 469
 470 void *
 471 spl_kmem_zalloc(size_t size, int flags, const char *func, int line)
 472 {
 473         ASSERT0(flags & ~KM_PUBLIC_MASK);
 474
 475         flags |= KM_ZERO;
 476
 477 #if !defined(DEBUG_KMEM)
 478         return (spl_kmem_alloc_impl(size, flags, NUMA_NO_NODE));
 479 #elif !defined(DEBUG_KMEM_TRACKING)
 480         return (spl_kmem_alloc_debug(size, flags, NUMA_NO_NODE));
 481 #else
 482         return (spl_kmem_alloc_track(size, flags, func, line, NUMA_NO_NODE));
 483 #endif
 484 }
 485 EXPORT_SYMBOL(spl_kmem_zalloc);
 486
 487 void
 488 spl_kmem_free(const void *buf, size_t size)
 489 {
 490 #if !defined(DEBUG_KMEM)
 491         return (spl_kmem_free_impl(buf, size));
 492 #elif !defined(DEBUG_KMEM_TRACKING)
 493         return (spl_kmem_free_debug(buf, size));
 494 #else
 495         return (spl_kmem_free_track(buf, size));
 496 #endif
 497 }
 498 EXPORT_SYMBOL(spl_kmem_free);
 499
 500 #if defined(DEBUG_KMEM) && defined(DEBUG_KMEM_TRACKING)
 501 static char *
 502 spl_sprintf_addr(kmem_debug_t *kd, char *str, int len, int min)
 503 {
 504         int size = ((len - 1) < kd->kd_size) ? (len - 1) : kd->kd_size;
 505         int i, flag = 1;
 506
 507         ASSERT(str != NULL && len >= 17);
 508         memset(str, 0, len);
 509
 510         /*
 511          * Check for a fully printable string, and while we are at
 512          * it place the printable characters in the passed buffer.
 513          */
 514         for (i = 0; i < size; i++) {
 515                 str[i] = ((char *)(kd->kd_addr))[i];
 516                 if (isprint(str[i])) {
 517                         continue;
 518                 } else {
 519                         /*
 520                          * Minimum number of printable characters found
 521                          * to make it worthwhile to print this as ascii.
 522                          */
 523                         if (i > min)
 524                                 break;
 525
 526                         flag = 0;
 527                         break;
 528                 }
 529         }
 530
 531         if (!flag) {
 532                 sprintf(str, "%02x%02x%02x%02x%02x%02x%02x%02x",
 533                     *((uint8_t *)kd->kd_addr),
 534                     *((uint8_t *)kd->kd_addr + 2),
 535                     *((uint8_t *)kd->kd_addr + 4),
 536                     *((uint8_t *)kd->kd_addr + 6),
 537                     *((uint8_t *)kd->kd_addr + 8),
 538                     *((uint8_t *)kd->kd_addr + 10),
 539                     *((uint8_t *)kd->kd_addr + 12),
 540                     *((uint8_t *)kd->kd_addr + 14));
 541         }
 542
 543         return (str);
 544 }
 545
 546 static int
 547 spl_kmem_init_tracking(struct list_head *list, spinlock_t *lock, int size)
 548 {
 549         int i;
 550
 551         spin_lock_init(lock);
 552         INIT_LIST_HEAD(list);
 553
 554         for (i = 0; i < size; i++)
 555                 INIT_HLIST_HEAD(&kmem_table[i]);
 556
 557         return (0);
 558 }
 559
 560 static void
 561 spl_kmem_fini_tracking(struct list_head *list, spinlock_t *lock)
 562 {
 563         unsigned long flags;
 564         kmem_debug_t *kd = NULL;
 565         char str[17];
 566
 567         spin_lock_irqsave(lock, flags);
 568         if (!list_empty(list))
 569                 printk(KERN_WARNING "%-16s %-5s %-16s %s:%s\n", "address",
 570                     "size", "data", "func", "line");
 571
 572         list_for_each_entry(kd, list, kd_list) {
 573                 printk(KERN_WARNING "%p %-5d %-16s %s:%d\n", kd->kd_addr,
 574                     (int)kd->kd_size, spl_sprintf_addr(kd, str, 17, 8),
 575                     kd->kd_func, kd->kd_line);
 576         }
 577
 578         spin_unlock_irqrestore(lock, flags);
 579 }
 580 #endif /* DEBUG_KMEM && DEBUG_KMEM_TRACKING */
 581
 582 int
 583 spl_kmem_init(void)
 584 {
 585
 586 #ifdef DEBUG_KMEM
 587         kmem_alloc_used_set(0);
 588
 589
 590
 591 #ifdef DEBUG_KMEM_TRACKING
 592         spl_kmem_init_tracking(&kmem_list, &kmem_lock, KMEM_TABLE_SIZE);
 593 #endif /* DEBUG_KMEM_TRACKING */
 594 #endif /* DEBUG_KMEM */
 595
 596         return (0);
 597 }
 598
 599 void
 600 spl_kmem_fini(void)
 601 {
 602 #ifdef DEBUG_KMEM
 603         /*
 604          * Display all unreclaimed memory addresses, including the
 605          * allocation size and the first few bytes of what's located
 606          * at that address to aid in debugging.  Performance is not
 607          * a serious concern here since it is module unload time.
 608          */
 609         if (kmem_alloc_used_read() != 0)
 610                 printk(KERN_WARNING "kmem leaked %ld/%llu bytes\n",
 611                     (unsigned long)kmem_alloc_used_read(), kmem_alloc_max);
 612
 613 #ifdef DEBUG_KMEM_TRACKING
 614         spl_kmem_fini_tracking(&kmem_list, &kmem_lock);
 615 #endif /* DEBUG_KMEM_TRACKING */
 616 #endif /* DEBUG_KMEM */
 617 }