File: /usr/src/linux/mm/slab.c
1 /*
2 * linux/mm/slab.c
3 * Written by Mark Hemment, 1996/97.
4 * (markhe@nextd.demon.co.uk)
5 *
6 * kmem_cache_destroy() + some cleanup - 1999 Andrea Arcangeli
7 *
8 * Major cleanup, different bufctl logic, per-cpu arrays
9 * (c) 2000 Manfred Spraul
10 *
11 * An implementation of the Slab Allocator as described in outline in;
12 * UNIX Internals: The New Frontiers by Uresh Vahalia
13 * Pub: Prentice Hall ISBN 0-13-101908-2
14 * or with a little more detail in;
15 * The Slab Allocator: An Object-Caching Kernel Memory Allocator
16 * Jeff Bonwick (Sun Microsystems).
17 * Presented at: USENIX Summer 1994 Technical Conference
18 *
19 *
20 * The memory is organized in caches, one cache for each object type.
21 * (e.g. inode_cache, dentry_cache, buffer_head, vm_area_struct)
22 * Each cache consists of many slabs (they are small (usually one
23 * page long) and always contiguous), and each slab contains multiple
24 * initialized objects.
25 *
26 * Each cache can only support one memory type (GFP_DMA, GFP_HIGHMEM,
27 * normal). If you need a special memory type, then you must create a new
28 * cache for that memory type.
29 *
30 * In order to reduce fragmentation, the slabs are sorted in 3 groups:
31 * full slabs with 0 free objects
32 * partial slabs
33 * empty slabs with no allocated objects
34 *
35 * If partial slabs exist, then new allocations come from these slabs,
36 * otherwise from empty slabs or new slabs are allocated.
37 *
38 * kmem_cache_destroy() CAN CRASH if you try to allocate from the cache
39 * during kmem_cache_destroy(). The caller must prevent concurrent allocs.
40 *
41 * On SMP systems, each cache has a short per-cpu head array, most allocs
42 * and frees go into that array, and if that array overflows, then 1/2
43 * of the entries in the array are given back into the global cache.
44 * This reduces the number of spinlock operations.
45 *
46 * The c_cpuarray may not be read with enabled local interrupts.
47 *
48 * SMP synchronization:
49 * constructors and destructors are called without any locking.
50 * Several members in kmem_cache_t and slab_t never change, they
51 * are accessed without any locking.
52 * The per-cpu arrays are never accessed from the wrong cpu, no locking.
53 * The non-constant members are protected with a per-cache irq spinlock.
54 *
55 * Further notes from the original documentation:
56 *
57 * 11 April '97. Started multi-threading - markhe
58 * The global cache-chain is protected by the semaphore 'cache_chain_sem'.
59 * The sem is only needed when accessing/extending the cache-chain, which
60 * can never happen inside an interrupt (kmem_cache_create(),
61 * kmem_cache_shrink() and kmem_cache_reap()).
62 *
63 * To prevent kmem_cache_shrink() trying to shrink a 'growing' cache (which
64 * may be sleeping and therefore not holding the semaphore/lock), the
65 * growing field is used. This also prevents reaping from a cache.
66 *
67 * At present, each engine can be growing a cache. This should be blocked.
68 *
69 */
70
71 #include <linux/config.h>
72 #include <linux/slab.h>
73 #include <linux/interrupt.h>
74 #include <linux/init.h>
75 #include <linux/compiler.h>
76 #include <asm/uaccess.h>
77
78 /*
79 * DEBUG - 1 for kmem_cache_create() to honour; SLAB_DEBUG_INITIAL,
80 * SLAB_RED_ZONE & SLAB_POISON.
81 * 0 for faster, smaller code (especially in the critical paths).
82 *
83 * STATS - 1 to collect stats for /proc/slabinfo.
84 * 0 for faster, smaller code (especially in the critical paths).
85 *
86 * FORCED_DEBUG - 1 enables SLAB_RED_ZONE and SLAB_POISON (if possible)
87 */
88
89 #ifdef CONFIG_DEBUG_SLAB
90 #define DEBUG 1
91 #define STATS 1
92 #define FORCED_DEBUG 1
93 #else
94 #define DEBUG 0
95 #define STATS 0
96 #define FORCED_DEBUG 0
97 #endif
98
99 /*
100 * Parameters for kmem_cache_reap
101 */
102 #define REAP_SCANLEN 10
103 #define REAP_PERFECT 10
104
105 /* Shouldn't this be in a header file somewhere? */
106 #define BYTES_PER_WORD sizeof(void *)
107
108 /* Legal flag mask for kmem_cache_create(). */
109 #if DEBUG
110 # define CREATE_MASK (SLAB_DEBUG_INITIAL | SLAB_RED_ZONE | \
111 SLAB_POISON | SLAB_HWCACHE_ALIGN | \
112 SLAB_NO_REAP | SLAB_CACHE_DMA)
113 #else
114 # define CREATE_MASK (SLAB_HWCACHE_ALIGN | SLAB_NO_REAP | SLAB_CACHE_DMA)
115 #endif
116
117 /*
118 * kmem_bufctl_t:
119 *
120 * Bufctl's are used for linking objs within a slab;
121 * the links are offsets (obj indices), not pointers.
122 *
123 * This implementation relies on "struct page" for locating the cache &
124 * slab an object belongs to.
125 * This allows the bufctl structure to be small (one int), but limits
126 * the number of objects a slab (not a cache) can contain when off-slab
127 * bufctls are used. The limit is the size of the largest general cache
128 * that does not use off-slab slabs.
129 * For 32bit archs with 4 kB pages, this is 56.
130 * This is not serious, as it is only for large objects, when it is unwise
131 * to have too many per slab.
132 * Note: This limit can be raised by introducing a general cache whose size
133 * is less than 512 (PAGE_SIZE>>3), but greater than 256.
134 */
135
136 #define BUFCTL_END 0xffffFFFF
137 #define SLAB_LIMIT 0xffffFFFE
138 typedef unsigned int kmem_bufctl_t;
139
140 /* Max number of objs-per-slab for caches which use off-slab slabs.
141 * Needed to avoid a possible looping condition in kmem_cache_grow().
142 */
143 static unsigned long offslab_limit;
144
145 /*
146 * slab_t
147 *
148 * Manages the objs in a slab. Placed either at the beginning of mem allocated
149 * for a slab, or allocated from a general cache.
150 * Slabs are chained into three lists: fully used, partial, fully free slabs.
151 */
152 typedef struct slab_s {
153 struct list_head list; /* links the slab into one of its cache's slab lists */
154 unsigned long colouroff; /* offset of s_mem from the start of the slab's pages */
155 void *s_mem; /* including colour offset */
156 unsigned int inuse; /* num of objs active in slab */
157 kmem_bufctl_t free; /* index of the first obj on the bufctl chain (0 after init) */
158 } slab_t;
159 /* slab_bufctl(): the kmem_bufctl_t array that starts directly after the slab_t */
160 #define slab_bufctl(slabp) \
161 ((kmem_bufctl_t *)(((slab_t*)slabp)+1))
162
163 /*
164 * cpucache_t
165 *
166 * Per cpu structures
167 * The limit is stored in the per-cpu structure to reduce the data cache
168 * footprint.
169 */
170 typedef struct cpucache_s {
171 unsigned int avail; /* num of obj pointers currently held in the entry array */
172 unsigned int limit;
173 } cpucache_t;
174 /* cc_entry(): the void* entry array that starts directly after the cpucache_t */
175 #define cc_entry(cpucache) \
176 ((void **)(((cpucache_t*)(cpucache))+1))
177 #define cc_data(cachep) \
178 ((cachep)->cpudata[smp_processor_id()])
179 /*
180 * kmem_cache_t
181 *
182 * manages a cache.
183 * The members are grouped by how often they are touched (see 1) to 4) below).
184 */
184
185 #define CACHE_NAMELEN 20 /* max name length for a slab cache */
186
187 struct kmem_cache_s {
188 /* 1) each alloc & free */
189 /* full, partial first, then free */
190 struct list_head slabs_full;
191 struct list_head slabs_partial;
192 struct list_head slabs_free;
193 unsigned int objsize; /* obj size after alignment and red-zone padding */
194 unsigned int flags; /* constant flags */
195 unsigned int num; /* # of objs per slab */
196 spinlock_t spinlock; /* per-cache irq spinlock for the non-constant members */
197 #ifdef CONFIG_SMP
198 unsigned int batchcount;
199 #endif
200
201 /* 2) slab additions /removals */
202 /* order of pgs per slab (2^n) */
203 unsigned int gfporder;
204
205 /* force GFP flags, e.g. GFP_DMA */
206 unsigned int gfpflags;
207
208 size_t colour; /* cache colouring range */
209 unsigned int colour_off; /* colour offset */
210 unsigned int colour_next; /* cache colouring */
211 kmem_cache_t *slabp_cache; /* general cache supplying off-slab slab_t's */
212 unsigned int growing; /* non-zero stops __kmem_cache_shrink() from shrinking */
213 unsigned int dflags; /* dynamic flags */
214
215 /* constructor func */
216 void (*ctor)(void *, kmem_cache_t *, unsigned long);
217
218 /* de-constructor func */
219 void (*dtor)(void *, kmem_cache_t *, unsigned long);
220
221 unsigned long failures;
222
223 /* 3) cache creation/removal */
224 char name[CACHE_NAMELEN]; /* private copy made by kmem_cache_create() */
225 struct list_head next; /* link in the global cache_chain */
226 #ifdef CONFIG_SMP
227 /* 4) per-cpu data */
228 cpucache_t *cpudata[NR_CPUS];
229 #endif
230 #if STATS
231 unsigned long num_active;
232 unsigned long num_allocations;
233 unsigned long high_mark;
234 unsigned long grown;
235 unsigned long reaped;
236 unsigned long errors;
237 #ifdef CONFIG_SMP
238 atomic_t allochit;
239 atomic_t allocmiss;
240 atomic_t freehit;
241 atomic_t freemiss;
242 #endif
243 #endif
244 };
245
/*
 * Internal cache flags, stored in kmem_cache_t.flags next to the
 * public SLAB_* creation flags.
 */
#define	CFLGS_OFF_SLAB	0x010000UL	/* slab management in own cache */
#define	CFLGS_OPTIMIZE	0x020000UL	/* optimized slab lookup */

/*
 * Dynamic flags, stored in kmem_cache_t.dflags.
 * Need to hold the spinlock to access this member.
 */
#define	DFLGS_GROWN	0x000001UL	/* don't reap a recently grown */

/* Flag tests; (x) is a kmem_cache_t pointer. */
#define	OFF_SLAB(x)	((x)->flags & CFLGS_OFF_SLAB)
#define	OPTIMIZE(x)	((x)->flags & CFLGS_OPTIMIZE)
/* Tests the dynamic flags: the member is 'dflags' (it used to be misspelt 'dlags'). */
#define	GROWN(x)	((x)->dflags & DFLGS_GROWN)
256 /* Statistics hooks; (x) is a kmem_cache_t pointer. With STATS == 0 each hook is an empty statement. */
257 #if STATS
258 #define STATS_INC_ACTIVE(x) ((x)->num_active++)
259 #define STATS_DEC_ACTIVE(x) ((x)->num_active--)
260 #define STATS_INC_ALLOCED(x) ((x)->num_allocations++)
261 #define STATS_INC_GROWN(x) ((x)->grown++)
262 #define STATS_INC_REAPED(x) ((x)->reaped++)
263 #define STATS_SET_HIGH(x) do { if ((x)->num_active > (x)->high_mark) \
264 (x)->high_mark = (x)->num_active; \
265 } while (0)
266 #define STATS_INC_ERR(x) ((x)->errors++)
267 #else
268 #define STATS_INC_ACTIVE(x) do { } while (0)
269 #define STATS_DEC_ACTIVE(x) do { } while (0)
270 #define STATS_INC_ALLOCED(x) do { } while (0)
271 #define STATS_INC_GROWN(x) do { } while (0)
272 #define STATS_INC_REAPED(x) do { } while (0)
273 #define STATS_SET_HIGH(x) do { } while (0)
274 #define STATS_INC_ERR(x) do { } while (0)
275 #endif
276 /* Hit/miss counters are atomic_t and exist only when both STATS and CONFIG_SMP are set. */
277 #if STATS && defined(CONFIG_SMP)
278 #define STATS_INC_ALLOCHIT(x) atomic_inc(&(x)->allochit)
279 #define STATS_INC_ALLOCMISS(x) atomic_inc(&(x)->allocmiss)
280 #define STATS_INC_FREEHIT(x) atomic_inc(&(x)->freehit)
281 #define STATS_INC_FREEMISS(x) atomic_inc(&(x)->freemiss)
282 #else
283 #define STATS_INC_ALLOCHIT(x) do { } while (0)
284 #define STATS_INC_ALLOCMISS(x) do { } while (0)
285 #define STATS_INC_FREEHIT(x) do { } while (0)
286 #define STATS_INC_FREEMISS(x) do { } while (0)
287 #endif
288
289 #if DEBUG
290 /* Magic nums for obj red zoning.
291 * Placed in the first and the last word of each obj slot (objsize includes both words).
292 */
293 #define RED_MAGIC1 0x5A2CF071UL /* written by kmem_cache_init_objs(); expected on every obj of a slab being destroyed */
294 #define RED_MAGIC2 0x170FC2A5UL /* NOTE(review): presumably marks a handed-out obj - alloc/free paths not in view, confirm */
295
296 /* ...and for poisoning */
297 #define POISON_BYTE 0x5a /* byte value for poisoning */
298 #define POISON_END 0xa5 /* end-byte of poisoning */
299
300 #endif
301
302 /* maximum size of an obj (in 2^order pages) */
303 #define MAX_OBJ_ORDER 5 /* 32 pages */
304
305 /*
306 * Do not go above this order unless 0 objects fit into the slab.
307 */
308 #define BREAK_GFP_ORDER_HI 2
309 #define BREAK_GFP_ORDER_LO 1
310 static int slab_break_gfp_order = BREAK_GFP_ORDER_LO;
311
312 /*
313 * Absolute limit for the gfp order
314 */
315 #define MAX_GFP_ORDER 5 /* 32 pages */
316
317
318 /* Macros for storing/retrieving the cachep and or slab from the
319 * global 'mem_map'. These are used to find the slab an obj belongs to.
320 * With kfree(), these are used to find the cache which an obj belongs to.
321 */
322 #define SET_PAGE_CACHE(pg,x) ((pg)->list.next = (struct list_head *)(x))
323 #define GET_PAGE_CACHE(pg) ((kmem_cache_t *)(pg)->list.next)
324 #define SET_PAGE_SLAB(pg,x) ((pg)->list.prev = (struct list_head *)(x))
325 #define GET_PAGE_SLAB(pg) ((slab_t *)(pg)->list.prev)
326
327 /* Size description struct for general caches. */
328 typedef struct cache_sizes {
329 size_t cs_size; /* obj size of this pair of caches; 0 terminates the table */
330 kmem_cache_t *cs_cachep; /* created by kmem_cache_sizes_init() without SLAB_CACHE_DMA */
331 kmem_cache_t *cs_dmacachep; /* created by kmem_cache_sizes_init() with SLAB_CACHE_DMA */
332 } cache_sizes_t;
333
334 static cache_sizes_t cache_sizes[] = {
335 #if PAGE_SIZE == 4096
336 { 32, NULL, NULL}, /* the 32-byte cache exists only with 4 kB pages */
337 #endif
338 { 64, NULL, NULL},
339 { 128, NULL, NULL},
340 { 256, NULL, NULL},
341 { 512, NULL, NULL},
342 { 1024, NULL, NULL},
343 { 2048, NULL, NULL},
344 { 4096, NULL, NULL},
345 { 8192, NULL, NULL},
346 { 16384, NULL, NULL},
347 { 32768, NULL, NULL},
348 { 65536, NULL, NULL},
349 {131072, NULL, NULL},
350 { 0, NULL, NULL} /* terminator: kmem_cache_sizes_init() stops at cs_size == 0 */
351 };
352
353 /* internal cache of cache description objs */
354 static kmem_cache_t cache_cache = {
355 slabs_full: LIST_HEAD_INIT(cache_cache.slabs_full),
356 slabs_partial: LIST_HEAD_INIT(cache_cache.slabs_partial),
357 slabs_free: LIST_HEAD_INIT(cache_cache.slabs_free),
358 objsize: sizeof(kmem_cache_t),
359 flags: SLAB_NO_REAP,
360 spinlock: SPIN_LOCK_UNLOCKED,
361 colour_off: L1_CACHE_BYTES,
362 name: "kmem_cache",
363 };
364
365 /* Guard access to the cache-chain. */
366 static struct semaphore cache_chain_sem;
367
368 /* Place maintainer for reaping. */
369 static kmem_cache_t *clock_searchp = &cache_cache;
370
371 #define cache_chain (cache_cache.next)
372
373 #ifdef CONFIG_SMP
374 /*
375 * chicken and egg problem: delay the per-cpu array allocation
376 * until the general caches are up.
377 */
378 static int g_cpucache_up;
379
380 static void enable_cpucache (kmem_cache_t *cachep);
381 static void enable_all_cpucaches (void);
382 #endif
383
384 /* Cal the num objs, wastage, and bytes left over for a given slab size. */
385 static void kmem_cache_estimate (unsigned long gfporder, size_t size,
386 int flags, size_t *left_over, unsigned int *num)
387 {
388 int i;
389 size_t wastage = PAGE_SIZE<<gfporder;
390 size_t extra = 0;
391 size_t base = 0;
392
393 if (!(flags & CFLGS_OFF_SLAB)) {
394 base = sizeof(slab_t);
395 extra = sizeof(kmem_bufctl_t);
396 }
397 i = 0;
398 while (i*size + L1_CACHE_ALIGN(base+i*extra) <= wastage)
399 i++;
400 if (i > 0)
401 i--;
402
403 if (i > SLAB_LIMIT)
404 i = SLAB_LIMIT;
405
406 *num = i;
407 wastage -= i*size;
408 wastage -= L1_CACHE_ALIGN(base+i*extra);
409 *left_over = wastage;
410 }
411
412 /* Initialisation - setup the `cache' cache. */
413 void __init kmem_cache_init(void)
414 {
415 size_t left_over;
416
417 init_MUTEX(&cache_chain_sem);
418 INIT_LIST_HEAD(&cache_chain);
419
420 kmem_cache_estimate(0, cache_cache.objsize, 0,
421 &left_over, &cache_cache.num);
422 if (!cache_cache.num)
423 BUG();
424
425 cache_cache.colour = left_over/cache_cache.colour_off;
426 cache_cache.colour_next = 0;
427 }
428
429
430 /* Initialisation - setup remaining internal and general caches.
431 * Called after the gfp() functions have been enabled, and before smp_init().
432 */
433 void __init kmem_cache_sizes_init(void)
434 {
435 cache_sizes_t *sizes = cache_sizes;
436 char name[20];
437 /*
438 * Fragmentation resistance on low memory - only use bigger
439 * page orders on machines with more than 32MB of memory.
440 */
441 if (num_physpages > (32 << 20) >> PAGE_SHIFT)
442 slab_break_gfp_order = BREAK_GFP_ORDER_HI;
443 do {
444 /* For performance, all the general caches are L1 aligned.
445 * This should be particularly beneficial on SMP boxes, as it
446 * eliminates "false sharing".
447 * Note for systems short on memory removing the alignment will
448 * allow tighter packing of the smaller caches. */
449 sprintf(name,"size-%Zd",sizes->cs_size);
450 if (!(sizes->cs_cachep =
451 kmem_cache_create(name, sizes->cs_size,
452 0, SLAB_HWCACHE_ALIGN, NULL, NULL))) {
453 BUG();
454 }
455
456 /* Inc off-slab bufctl limit until the ceiling is hit. */
457 if (!(OFF_SLAB(sizes->cs_cachep))) {
458 offslab_limit = sizes->cs_size-sizeof(slab_t);
459 offslab_limit /= 2;
460 }
461 sprintf(name, "size-%Zd(DMA)",sizes->cs_size);
462 sizes->cs_dmacachep = kmem_cache_create(name, sizes->cs_size, 0,
463 SLAB_CACHE_DMA|SLAB_HWCACHE_ALIGN, NULL, NULL);
464 if (!sizes->cs_dmacachep)
465 BUG();
466 sizes++;
467 } while (sizes->cs_size);
468 }
469 /* On SMP: record that the per-cpu arrays may now be set up (g_cpucache_up) and call enable_all_cpucaches(). Always returns 0; registered with __initcall() below. */
470 int __init kmem_cpucache_init(void)
471 {
472 #ifdef CONFIG_SMP
473 g_cpucache_up = 1;
474 enable_all_cpucaches();
475 #endif
476 return 0;
477 }
478
479 __initcall(kmem_cpucache_init);
480
481 /* Interface to system's page allocator. No need to hold the cache-lock.
482 */
483 static inline void * kmem_getpages (kmem_cache_t *cachep, unsigned long flags)
484 {
485 void *addr;
486
487 /*
488 * If we requested dmaable memory, we will get it. Even if we
489 * did not request dmaable memory, we might get it, but that
490 * would be relatively rare and ignorable.
491 */
492 flags |= cachep->gfpflags;
493 addr = (void*) __get_free_pages(flags, cachep->gfporder);
494 /* Assume that now we have the pages no one else can legally
495 * messes with the 'struct page's.
496 * However vm_scan() might try to test the structure to see if
497 * it is a named-page or buffer-page. The members it tests are
498 * of no interest here.....
499 */
500 return addr;
501 }
502
503 /* Interface to system's page release. */
504 static inline void kmem_freepages (kmem_cache_t *cachep, void *addr)
505 {
506 unsigned long i = (1<<cachep->gfporder);
507 struct page *page = virt_to_page(addr);
508
509 /* free_pages() does not clear the type bit - we do that.
510 * The pages have been unlinked from their cache-slab,
511 * but their 'struct page's might be accessed in
512 * vm_scan(). Shouldn't be a worry.
513 */
514 while (i--) {
515 PageClearSlab(page);
516 page++;
517 }
518 free_pages((unsigned long)addr, cachep->gfporder);
519 }
520
521 #if DEBUG
522 static inline void kmem_poison_obj (kmem_cache_t *cachep, void *addr)
523 {
524 int size = cachep->objsize;
525 if (cachep->flags & SLAB_RED_ZONE) {
526 addr += BYTES_PER_WORD;
527 size -= 2*BYTES_PER_WORD;
528 }
529 memset(addr, POISON_BYTE, size);
530 *(unsigned char *)(addr+size-1) = POISON_END;
531 }
532
533 static inline int kmem_check_poison_obj (kmem_cache_t *cachep, void *addr)
534 {
535 int size = cachep->objsize;
536 void *end;
537 if (cachep->flags & SLAB_RED_ZONE) {
538 addr += BYTES_PER_WORD;
539 size -= 2*BYTES_PER_WORD;
540 }
541 end = memchr(addr, POISON_END, size);
542 if (end != (addr+size-1))
543 return 1;
544 return 0;
545 }
546 #endif
547
548 /* Destroy all the objs in a slab, and release the mem back to the system.
549 * Before calling the slab must have been unlinked from the cache.
550 * The cache-lock is not held/needed.
551 */
552 static void kmem_slab_destroy (kmem_cache_t *cachep, slab_t *slabp)
553 {
554 if (cachep->dtor
555 #if DEBUG
556 || cachep->flags & (SLAB_POISON | SLAB_RED_ZONE)
557 #endif
558 ) {
559 int i;
560 for (i = 0; i < cachep->num; i++) {
561 void* objp = slabp->s_mem+cachep->objsize*i;
562 #if DEBUG
563 if (cachep->flags & SLAB_RED_ZONE) {
564 if (*((unsigned long*)(objp)) != RED_MAGIC1)
565 BUG();
566 if (*((unsigned long*)(objp + cachep->objsize
567 -BYTES_PER_WORD)) != RED_MAGIC1)
568 BUG();
569 objp += BYTES_PER_WORD;
570 }
571 #endif
572 if (cachep->dtor)
573 (cachep->dtor)(objp, cachep, 0);
574 #if DEBUG
575 if (cachep->flags & SLAB_RED_ZONE) {
576 objp -= BYTES_PER_WORD;
577 }
578 if ((cachep->flags & SLAB_POISON) &&
579 kmem_check_poison_obj(cachep, objp))
580 BUG();
581 #endif
582 }
583 }
584
585 kmem_freepages(cachep, slabp->s_mem-slabp->colouroff);
586 if (OFF_SLAB(cachep))
587 kmem_cache_free(cachep->slabp_cache, slabp);
588 }
589
590 /**
591 * kmem_cache_create - Create a cache.
592 * @name: A string which is used in /proc/slabinfo to identify this cache.
593 * @size: The size of objects to be created in this cache.
594 * @offset: The offset to use within the page.
595 * @flags: SLAB flags
596 * @ctor: A constructor for the objects.
597 * @dtor: A destructor for the objects.
598 *
599 * Returns a ptr to the cache on success, NULL on failure.
600 * Cannot be called within a int, but can be interrupted.
601 * The @ctor is run when new pages are allocated by the cache
602 * and the @dtor is run before the pages are handed back.
603 * The flags are
604 *
605 * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5)
606 * to catch references to uninitialised memory.
607 *
608 * %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check
609 * for buffer overruns.
610 *
611 * %SLAB_NO_REAP - Don't automatically reap this cache when we're under
612 * memory pressure.
613 *
614 * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware
615 * cacheline. This can be beneficial if you're counting cycles as closely
616 * as davem.
617 */
618 kmem_cache_t *
619 kmem_cache_create (const char *name, size_t size, size_t offset,
620 unsigned long flags, void (*ctor)(void*, kmem_cache_t *, unsigned long),
621 void (*dtor)(void*, kmem_cache_t *, unsigned long))
622 {
623 const char *func_nm = KERN_ERR "kmem_create: ";
624 size_t left_over, align, slab_size;
625 kmem_cache_t *cachep = NULL;
626
627 /*
628 * Sanity checks... these are all serious usage bugs.
629 */
630 if ((!name) ||
631 ((strlen(name) >= CACHE_NAMELEN - 1)) ||
632 in_interrupt() ||
633 (size < BYTES_PER_WORD) ||
634 (size > (1<<MAX_OBJ_ORDER)*PAGE_SIZE) ||
635 (dtor && !ctor) ||
636 (offset < 0 || offset > size))
637 BUG();
638
639 #if DEBUG
640 if ((flags & SLAB_DEBUG_INITIAL) && !ctor) {
641 /* No constructor, but inital state check requested */
642 printk("%sNo con, but init state check requested - %s\n", func_nm, name);
643 flags &= ~SLAB_DEBUG_INITIAL;
644 }
645
646 if ((flags & SLAB_POISON) && ctor) {
647 /* request for poisoning, but we can't do that with a constructor */
648 printk("%sPoisoning requested, but con given - %s\n", func_nm, name);
649 flags &= ~SLAB_POISON;
650 }
651 #if FORCED_DEBUG
652 if (size < (PAGE_SIZE>>3))
653 /*
654 * do not red zone large object, causes severe
655 * fragmentation.
656 */
657 flags |= SLAB_RED_ZONE;
658 if (!ctor)
659 flags |= SLAB_POISON;
660 #endif
661 #endif
662
663 /*
664 * Always checks flags, a caller might be expecting debug
665 * support which isn't available.
666 */
667 if (flags & ~CREATE_MASK)
668 BUG();
669
670 /* Get cache's description obj. */
671 cachep = (kmem_cache_t *) kmem_cache_alloc(&cache_cache, SLAB_KERNEL);
672 if (!cachep)
673 goto opps;
674 memset(cachep, 0, sizeof(kmem_cache_t));
675
676 /* Check that size is in terms of words. This is needed to avoid
677 * unaligned accesses for some archs when redzoning is used, and makes
678 * sure any on-slab bufctl's are also correctly aligned.
679 */
680 if (size & (BYTES_PER_WORD-1)) {
681 size += (BYTES_PER_WORD-1);
682 size &= ~(BYTES_PER_WORD-1);
683 printk("%sForcing size word alignment - %s\n", func_nm, name);
684 }
685
686 #if DEBUG
687 if (flags & SLAB_RED_ZONE) {
688 /*
689 * There is no point trying to honour cache alignment
690 * when redzoning.
691 */
692 flags &= ~SLAB_HWCACHE_ALIGN;
693 size += 2*BYTES_PER_WORD; /* words for redzone */
694 }
695 #endif
696 align = BYTES_PER_WORD;
697 if (flags & SLAB_HWCACHE_ALIGN)
698 align = L1_CACHE_BYTES;
699
700 /* Determine if the slab management is 'on' or 'off' slab. */
701 if (size >= (PAGE_SIZE>>3))
702 /*
703 * Size is large, assume best to place the slab management obj
704 * off-slab (should allow better packing of objs).
705 */
706 flags |= CFLGS_OFF_SLAB;
707
708 if (flags & SLAB_HWCACHE_ALIGN) {
709 /* Need to adjust size so that objs are cache aligned. */
710 /* Small obj size, can get at least two per cache line. */
711 /* FIXME: only power of 2 supported, was better */
712 while (size < align/2)
713 align /= 2;
714 size = (size+align-1)&(~(align-1));
715 }
716
717 /* Cal size (in pages) of slabs, and the num of objs per slab.
718 * This could be made much more intelligent. For now, try to avoid
719 * using high page-orders for slabs. When the gfp() funcs are more
720 * friendly towards high-order requests, this should be changed.
721 */
722 do {
723 unsigned int break_flag = 0;
724 cal_wastage:
725 kmem_cache_estimate(cachep->gfporder, size, flags,
726 &left_over, &cachep->num);
727 if (break_flag)
728 break;
729 if (cachep->gfporder >= MAX_GFP_ORDER)
730 break;
731 if (!cachep->num)
732 goto next;
733 if (flags & CFLGS_OFF_SLAB && cachep->num > offslab_limit) {
734 /* Oops, this num of objs will cause problems. */
735 cachep->gfporder--;
736 break_flag++;
737 goto cal_wastage;
738 }
739
740 /*
741 * Large num of objs is good, but v. large slabs are currently
742 * bad for the gfp()s.
743 */
744 if (cachep->gfporder >= slab_break_gfp_order)
745 break;
746
747 if ((left_over*8) <= (PAGE_SIZE<<cachep->gfporder))
748 break; /* Acceptable internal fragmentation. */
749 next:
750 cachep->gfporder++;
751 } while (1);
752
753 if (!cachep->num) {
754 printk("kmem_cache_create: couldn't create cache %s.\n", name);
755 kmem_cache_free(&cache_cache, cachep);
756 cachep = NULL;
757 goto opps;
758 }
759 slab_size = L1_CACHE_ALIGN(cachep->num*sizeof(kmem_bufctl_t)+sizeof(slab_t));
760
761 /*
762 * If the slab has been placed off-slab, and we have enough space then
763 * move it on-slab. This is at the expense of any extra colouring.
764 */
765 if (flags & CFLGS_OFF_SLAB && left_over >= slab_size) {
766 flags &= ~CFLGS_OFF_SLAB;
767 left_over -= slab_size;
768 }
769
770 /* Offset must be a multiple of the alignment. */
771 offset += (align-1);
772 offset &= ~(align-1);
773 if (!offset)
774 offset = L1_CACHE_BYTES;
775 cachep->colour_off = offset;
776 cachep->colour = left_over/offset;
777
778 /* init remaining fields */
779 if (!cachep->gfporder && !(flags & CFLGS_OFF_SLAB))
780 flags |= CFLGS_OPTIMIZE;
781
782 cachep->flags = flags;
783 cachep->gfpflags = 0;
784 if (flags & SLAB_CACHE_DMA)
785 cachep->gfpflags |= GFP_DMA;
786 spin_lock_init(&cachep->spinlock);
787 cachep->objsize = size;
788 INIT_LIST_HEAD(&cachep->slabs_full);
789 INIT_LIST_HEAD(&cachep->slabs_partial);
790 INIT_LIST_HEAD(&cachep->slabs_free);
791
792 if (flags & CFLGS_OFF_SLAB)
793 cachep->slabp_cache = kmem_find_general_cachep(slab_size,0);
794 cachep->ctor = ctor;
795 cachep->dtor = dtor;
796 /* Copy name over so we don't have problems with unloaded modules */
797 strcpy(cachep->name, name);
798
799 #ifdef CONFIG_SMP
800 if (g_cpucache_up)
801 enable_cpucache(cachep);
802 #endif
803 /* Need the semaphore to access the chain. */
804 down(&cache_chain_sem);
805 {
806 struct list_head *p;
807
808 list_for_each(p, &cache_chain) {
809 kmem_cache_t *pc = list_entry(p, kmem_cache_t, next);
810
811 /* The name field is constant - no lock needed. */
812 if (!strcmp(pc->name, name))
813 BUG();
814 }
815 }
816
817 /* There is no reason to lock our new cache before we
818 * link it in - no one knows about it yet...
819 */
820 list_add(&cachep->next, &cache_chain);
821 up(&cache_chain_sem);
822 opps:
823 return cachep;
824 }
825
826
827 #if DEBUG
828 /*
829 * This check if the kmem_cache_t pointer is chained in the cache_cache
830 * list. -arca
831 */
832 static int is_chained_kmem_cache(kmem_cache_t * cachep)
833 {
834 struct list_head *p;
835 int ret = 0;
836
837 /* Find the cache in the chain of caches. */
838 down(&cache_chain_sem);
839 list_for_each(p, &cache_chain) {
840 if (p == &cachep->next) {
841 ret = 1;
842 break;
843 }
844 }
845 up(&cache_chain_sem);
846
847 return ret;
848 }
849 #else
850 #define is_chained_kmem_cache(x) 1
851 #endif
852
853 #ifdef CONFIG_SMP
/*
 * Run func(arg) on every CPU and wait until all of them are done:
 * first on the calling CPU with local interrupts disabled, then on
 * the others through smp_call_function(). A failure of the latter
 * is a BUG().
 */
static void smp_call_function_all_cpus(void (*func) (void *arg), void *arg)
{
	int failed;

	local_irq_disable();
	func(arg);
	local_irq_enable();

	failed = smp_call_function(func, arg, 1, 1);
	if (failed)
		BUG();
}
866 typedef struct ccupdate_struct_s /* argument block for do_ccupdate_local() */
867 {
868 kmem_cache_t *cachep; /* cache whose per-cpu pointers are exchanged */
869 cpucache_t *new[NR_CPUS]; /* in: pointer to install per cpu; out: pointer it replaced */
870 } ccupdate_struct_t;
871
872 static void do_ccupdate_local(void *info)
873 {
874 ccupdate_struct_t *new = (ccupdate_struct_t *)info;
875 cpucache_t *old = cc_data(new->cachep);
876
877 cc_data(new->cachep) = new->new[smp_processor_id()];
878 new->new[smp_processor_id()] = old;
879 }
880
881 static void free_block (kmem_cache_t* cachep, void** objpp, int len);
882
883 static void drain_cpu_caches(kmem_cache_t *cachep)
884 {
885 ccupdate_struct_t new;
886 int i;
887
888 memset(&new.new,0,sizeof(new.new));
889
890 new.cachep = cachep;
891
892 down(&cache_chain_sem);
893 smp_call_function_all_cpus(do_ccupdate_local, (void *)&new);
894
895 for (i = 0; i < smp_num_cpus; i++) {
896 cpucache_t* ccold = new.new[cpu_logical_map(i)];
897 if (!ccold || (ccold->avail == 0))
898 continue;
899 local_irq_disable();
900 free_block(cachep, cc_entry(ccold), ccold->avail);
901 local_irq_enable();
902 ccold->avail = 0;
903 }
904 smp_call_function_all_cpus(do_ccupdate_local, (void *)&new);
905 up(&cache_chain_sem);
906 }
907
908 #else
909 #define drain_cpu_caches(cachep) do { } while (0)
910 #endif
911
912 static int __kmem_cache_shrink(kmem_cache_t *cachep)
913 {
914 slab_t *slabp;
915 int ret;
916
917 drain_cpu_caches(cachep);
918
919 spin_lock_irq(&cachep->spinlock);
920
921 /* If the cache is growing, stop shrinking. */
922 while (!cachep->growing) {
923 struct list_head *p;
924
925 p = cachep->slabs_free.prev;
926 if (p == &cachep->slabs_free)
927 break;
928
929 slabp = list_entry(cachep->slabs_free.prev, slab_t, list);
930 #if DEBUG
931 if (slabp->inuse)
932 BUG();
933 #endif
934 list_del(&slabp->list);
935
936 spin_unlock_irq(&cachep->spinlock);
937 kmem_slab_destroy(cachep, slabp);
938 spin_lock_irq(&cachep->spinlock);
939 }
940 ret = !list_empty(&cachep->slabs_full) || !list_empty(&cachep->slabs_partial);
941 spin_unlock_irq(&cachep->spinlock);
942 return ret;
943 }
944
945 /**
946 * kmem_cache_shrink - Shrink a cache.
947 * @cachep: The cache to shrink.
948 *
949 * Releases as many slabs as possible for a cache.
950 * To help debugging, a zero exit status indicates all slabs were released.
951 */
952 int kmem_cache_shrink(kmem_cache_t *cachep)
953 {
954 if (!cachep || in_interrupt() || !is_chained_kmem_cache(cachep))
955 BUG();
956
957 return __kmem_cache_shrink(cachep);
958 }
959
960 /**
961 * kmem_cache_destroy - delete a cache
962 * @cachep: the cache to destroy
963 *
964 * Remove a kmem_cache_t object from the slab cache.
965 * Returns 0 on success.
966 *
967 * It is expected this function will be called by a module when it is
968 * unloaded. This will remove the cache completely, and avoid a duplicate
969 * cache being allocated each time a module is loaded and unloaded, if the
970 * module doesn't have persistent in-kernel storage across loads and unloads.
971 *
972 * The caller must guarantee that noone will allocate memory from the cache
973 * during the kmem_cache_destroy().
974 */
975 int kmem_cache_destroy (kmem_cache_t * cachep)
976 {
977 if (!cachep || in_interrupt() || cachep->growing)
978 BUG();
979
980 /* Find the cache in the chain of caches. */
981 down(&cache_chain_sem);
982 /* the chain is never empty, cache_cache is never destroyed */
983 if (clock_searchp == cachep)
984 clock_searchp = list_entry(cachep->next.next,
985 kmem_cache_t, next);
986 list_del(&cachep->next);
987 up(&cache_chain_sem);
988
989 if (__kmem_cache_shrink(cachep)) {
990 printk(KERN_ERR "kmem_cache_destroy: Can't free all objects %p\n",
991 cachep);
992 down(&cache_chain_sem);
993 list_add(&cachep->next,&cache_chain);
994 up(&cache_chain_sem);
995 return 1;
996 }
997 #ifdef CONFIG_SMP
998 {
999 int i;
1000 for (i = 0; i < NR_CPUS; i++)
1001 kfree(cachep->cpudata[i]);
1002 }
1003 #endif
1004 kmem_cache_free(&cache_cache, cachep);
1005
1006 return 0;
1007 }
1008
1009 /* Get the memory for a slab management obj. */
1010 static inline slab_t * kmem_cache_slabmgmt (kmem_cache_t *cachep,
1011 void *objp, int colour_off, int local_flags)
1012 {
1013 slab_t *slabp;
1014
1015 if (OFF_SLAB(cachep)) {
1016 /* Slab management obj is off-slab. */
1017 slabp = kmem_cache_alloc(cachep->slabp_cache, local_flags);
1018 if (!slabp)
1019 return NULL;
1020 } else {
1021 /* FIXME: change to
1022 slabp = objp
1023 * if you enable OPTIMIZE
1024 */
1025 slabp = objp+colour_off;
1026 colour_off += L1_CACHE_ALIGN(cachep->num *
1027 sizeof(kmem_bufctl_t) + sizeof(slab_t));
1028 }
1029 slabp->inuse = 0;
1030 slabp->colouroff = colour_off;
1031 slabp->s_mem = objp+colour_off;
1032
1033 return slabp;
1034 }
1035
1036 static inline void kmem_cache_init_objs (kmem_cache_t * cachep,
1037 slab_t * slabp, unsigned long ctor_flags)
1038 {
1039 int i;
1040
1041 for (i = 0; i < cachep->num; i++) {
1042 void* objp = slabp->s_mem+cachep->objsize*i;
1043 #if DEBUG
1044 if (cachep->flags & SLAB_RED_ZONE) {
1045 *((unsigned long*)(objp)) = RED_MAGIC1;
1046 *((unsigned long*)(objp + cachep->objsize -
1047 BYTES_PER_WORD)) = RED_MAGIC1;
1048 objp += BYTES_PER_WORD;
1049 }
1050 #endif
1051
1052 /*
1053 * Constructors are not allowed to allocate memory from
1054 * the same cache which they are a constructor for.
1055 * Otherwise, deadlock. They must also be threaded.
1056 */
1057 if (cachep->ctor)
1058 cachep->ctor(objp, cachep, ctor_flags);
1059 #if DEBUG
1060 if (cachep->flags & SLAB_RED_ZONE)
1061 objp -= BYTES_PER_WORD;
1062 if (cachep->flags & SLAB_POISON)
1063 /* need to poison the objs */
1064 kmem_poison_obj(cachep, objp);
1065 if (cachep->flags & SLAB_RED_ZONE) {
1066 if (*((unsigned long*)(objp)) != RED_MAGIC1)
1067 BUG();
1068 if (*((unsigned long*)(objp + cachep->objsize -
1069 BYTES_PER_WORD)) != RED_MAGIC1)
1070 BUG();
1071 }
1072 #endif
1073 slab_bufctl(slabp)[i] = i+1;
1074 }
1075 slab_bufctl(slabp)[i-1] = BUFCTL_END;
1076 slabp->free = 0;
1077 }
1078
1079 /*
1080 * Grow (by 1) the number of slabs within a cache. This is called by
1081 * kmem_cache_alloc() when there are no active objs left in a cache.
1082 */
1083 static int kmem_cache_grow (kmem_cache_t * cachep, int flags)
1084 {
1085 slab_t *slabp;
1086 struct page *page;
1087 void *objp;
1088 size_t offset;
1089 unsigned int i, local_flags;
1090 unsigned long ctor_flags;
1091 unsigned long save_flags;
1092
1093 /* Be lazy and only check for valid flags here,
1094 * keeping it out of the critical path in kmem_cache_alloc().
1095 */
1096 if (flags & ~(SLAB_DMA|SLAB_LEVEL_MASK|SLAB_NO_GROW))
1097 BUG();
1098 if (flags & SLAB_NO_GROW)
1099 return 0;
1100
1101 /*
1102 * The test for missing atomic flag is performed here, rather than
1103 * the more obvious place, simply to reduce the critical path length
1104 * in kmem_cache_alloc(). If a caller is seriously mis-behaving they
1105 * will eventually be caught here (where it matters).
1106 */
1107 if (in_interrupt() && (flags & SLAB_LEVEL_MASK) != SLAB_ATOMIC)
1108 BUG();
1109
1110 ctor_flags = SLAB_CTOR_CONSTRUCTOR;
1111 local_flags = (flags & SLAB_LEVEL_MASK);
1112 if (local_flags == SLAB_ATOMIC)
1113 /*
1114 * Not allowed to sleep. Need to tell a constructor about
1115 * this - it might need to know...
1116 */
1117 ctor_flags |= SLAB_CTOR_ATOMIC;
1118
1119 /* About to mess with non-constant members - lock. */
1120 spin_lock_irqsave(&cachep->spinlock, save_flags);
1121
1122 /* Get colour for the slab, and cal the next value. */
1123 offset = cachep->colour_next;
1124 cachep->colour_next++;
1125 if (cachep->colour_next >= cachep->colour)
1126 cachep->colour_next = 0;
1127 offset *= cachep->colour_off;
1128 cachep->dflags |= DFLGS_GROWN;
1129
1130 cachep->growing++;
1131 spin_unlock_irqrestore(&cachep->spinlock, save_flags);
1132
1133 /* A series of memory allocations for a new slab.
1134 * Neither the cache-chain semaphore, or cache-lock, are
1135 * held, but the incrementing c_growing prevents this
1136 * cache from being reaped or shrunk.
1137 * Note: The cache could be selected in for reaping in
1138 * kmem_cache_reap(), but when the final test is made the
1139 * growing value will be seen.
1140 */
1141
1142 /* Get mem for the objs. */
1143 if (!(objp = kmem_getpages(cachep, flags)))
1144 goto failed;
1145
1146 /* Get slab management. */
1147 if (!(slabp = kmem_cache_slabmgmt(cachep, objp, offset, local_flags)))
1148 goto opps1;
1149
1150 /* Nasty!!!!!! I hope this is OK. */
1151 i = 1 << cachep->gfporder;
1152 page = virt_to_page(objp);
1153 do {
1154 SET_PAGE_CACHE(page, cachep);
1155 SET_PAGE_SLAB(page, slabp);
1156 PageSetSlab(page);
1157 page++;
1158 } while (--i);
1159
1160 kmem_cache_init_objs(cachep, slabp, ctor_flags);
1161
1162 spin_lock_irqsave(&cachep->spinlock, save_flags);
1163 cachep->growing--;
1164
1165 /* Make slab active. */
1166 list_add_tail(&slabp->list, &cachep->slabs_free);
1167 STATS_INC_GROWN(cachep);
1168 cachep->failures = 0;
1169
1170 spin_unlock_irqrestore(&cachep->spinlock, save_flags);
1171 return 1;
1172 opps1:
1173 kmem_freepages(cachep, objp);
1174 failed:
1175 spin_lock_irqsave(&cachep->spinlock, save_flags);
1176 cachep->growing--;
1177 spin_unlock_irqrestore(&cachep->spinlock, save_flags);
1178 return 0;
1179 }
1180
/*
 * Perform extra freeing checks:
 * 	- detect double free
 * 	- detect bad pointers.
 * Called with the cache-lock held.
 *
 * Every failed check ends in BUG(); the function returns 0 otherwise.
 */

#if DEBUG
static int kmem_extra_free_checks (kmem_cache_t * cachep,
			slab_t *slabp, void * objp)
{
	int cursor;
	unsigned int objnr = (objp-slabp->s_mem)/cachep->objsize;

	/*
	 * The pointer must fall inside the slab and sit exactly on an
	 * object boundary.
	 */
	if (objnr >= cachep->num ||
	    objp != slabp->s_mem + objnr*cachep->objsize)
		BUG();

	/* Check slab's freelist to see if this obj is there. */
	cursor = slabp->free;
	while (cursor != BUFCTL_END) {
		if (cursor == objnr)
			BUG();
		cursor = slab_bufctl(slabp)[cursor];
	}
	return 0;
}
#endif
1208
1209 static inline void kmem_cache_alloc_head(kmem_cache_t *cachep, int flags)
1210 {
1211 if (flags & SLAB_DMA) {
1212 if (!(cachep->gfpflags & GFP_DMA))
1213 BUG();
1214 } else {
1215 if (cachep->gfpflags & GFP_DMA)
1216 BUG();
1217 }
1218 }
1219
1220 static inline void * kmem_cache_alloc_one_tail (kmem_cache_t *cachep,
1221 slab_t *slabp)
1222 {
1223 void *objp;
1224
1225 STATS_INC_ALLOCED(cachep);
1226 STATS_INC_ACTIVE(cachep);
1227 STATS_SET_HIGH(cachep);
1228
1229 /* get obj pointer */
1230 slabp->inuse++;
1231 objp = slabp->s_mem + slabp->free*cachep->objsize;
1232 slabp->free=slab_bufctl(slabp)[slabp->free];
1233
1234 if (unlikely(slabp->free == BUFCTL_END)) {
1235 list_del(&slabp->list);
1236 list_add(&slabp->list, &cachep->slabs_full);
1237 }
1238 #if DEBUG
1239 if (cachep->flags & SLAB_POISON)
1240 if (kmem_check_poison_obj(cachep, objp))
1241 BUG();
1242 if (cachep->flags & SLAB_RED_ZONE) {
1243 /* Set alloc red-zone, and check old one. */
1244 if (xchg((unsigned long *)objp, RED_MAGIC2) !=
1245 RED_MAGIC1)
1246 BUG();
1247 if (xchg((unsigned long *)(objp+cachep->objsize -
1248 BYTES_PER_WORD), RED_MAGIC2) != RED_MAGIC1)
1249 BUG();
1250 objp += BYTES_PER_WORD;
1251 }
1252 #endif
1253 return objp;
1254 }
1255
/*
 * Returns a ptr to an obj in the given cache.
 * caller must guarantee synchronization
 * #define for the goto optimization 8-)
 *
 * A partial slab is preferred; if there is none, the first free slab is
 * moved onto the partial list and used.  When both lists are empty the
 * macro jumps to the label alloc_new_slab, which the enclosing function
 * must provide.
 */
#define kmem_cache_alloc_one(cachep)				\
({								\
	struct list_head *partial_head, *pick;			\
	slab_t *chosen;						\
								\
	partial_head = &(cachep)->slabs_partial;		\
	pick = partial_head->next;				\
	if (unlikely(pick == partial_head)) {			\
		struct list_head *free_head;			\
								\
		free_head = &(cachep)->slabs_free;		\
		pick = free_head->next;				\
		if (unlikely(pick == free_head))		\
			goto alloc_new_slab;			\
		list_del(pick);					\
		list_add(pick, partial_head);			\
	}							\
								\
	chosen = list_entry(pick, slab_t, list);		\
	kmem_cache_alloc_one_tail(cachep, chosen);		\
})
1281
1282 #ifdef CONFIG_SMP
1283 void* kmem_cache_alloc_batch(kmem_cache_t* cachep, int flags)
1284 {
1285 int batchcount = cachep->batchcount;
1286 cpucache_t* cc = cc_data(cachep);
1287
1288 spin_lock(&cachep->spinlock);
1289 while (batchcount--) {
1290 struct list_head * slabs_partial, * entry;
1291 slab_t *slabp;
1292 /* Get slab alloc is to come from. */
1293 slabs_partial = &(cachep)->slabs_partial;
1294 entry = slabs_partial->next;
1295 if (unlikely(entry == slabs_partial)) {
1296 struct list_head * slabs_free;
1297 slabs_free = &(cachep)->slabs_free;
1298 entry = slabs_free->next;
1299 if (unlikely(entry == slabs_free))
1300 break;
1301 list_del(entry);
1302 list_add(entry, slabs_partial);
1303 }
1304
1305 slabp = list_entry(entry, slab_t, list);
1306 cc_entry(cc)[cc->avail++] =
1307 kmem_cache_alloc_one_tail(cachep, slabp);
1308 }
1309 spin_unlock(&cachep->spinlock);
1310
1311 if (cc->avail)
1312 return cc_entry(cc)[--cc->avail];
1313 return NULL;
1314 }
1315 #endif
1316
1317 static inline void * __kmem_cache_alloc (kmem_cache_t *cachep, int flags)
1318 {
1319 unsigned long save_flags;
1320 void* objp;
1321
1322 kmem_cache_alloc_head(cachep, flags);
1323 try_again:
1324 local_irq_save(save_flags);
1325 #ifdef CONFIG_SMP
1326 {
1327 cpucache_t *cc = cc_data(cachep);
1328
1329 if (cc) {
1330 if (cc->avail) {
1331 STATS_INC_ALLOCHIT(cachep);
1332 objp = cc_entry(cc)[--cc->avail];
1333 } else {
1334 STATS_INC_ALLOCMISS(cachep);
1335 objp = kmem_cache_alloc_batch(cachep,flags);
1336 if (!objp)
1337 goto alloc_new_slab_nolock;
1338 }
1339 } else {
1340 spin_lock(&cachep->spinlock);
1341 objp = kmem_cache_alloc_one(cachep);
1342 spin_unlock(&cachep->spinlock);
1343 }
1344 }
1345 #else
1346 objp = kmem_cache_alloc_one(cachep);
1347 #endif
1348 local_irq_restore(save_flags);
1349 return objp;
1350 alloc_new_slab:
1351 #ifdef CONFIG_SMP
1352 spin_unlock(&cachep->spinlock);
1353 alloc_new_slab_nolock:
1354 #endif
1355 local_irq_restore(save_flags);
1356 if (kmem_cache_grow(cachep, flags))
1357 /* Someone may have stolen our objs. Doesn't matter, we'll
1358 * just come back here again.
1359 */
1360 goto try_again;
1361 return NULL;
1362 }
1363
1364 /*
1365 * Release an obj back to its cache. If the obj has a constructed
1366 * state, it should be in this state _before_ it is released.
1367 * - caller is responsible for the synchronization
1368 */
1369
/*
 * Pointer sanity checks for the free paths.  Both macros expect a local
 * variable named objp in the caller; it is only used for the message.
 *
 * CHECK_NR(pg):   BUG() unless pg is a valid page.
 * CHECK_PAGE(pg): CHECK_NR() plus BUG() unless the page is marked as a
 *                 slab page.
 *
 * Without DEBUG, CHECK_PAGE() expands to nothing and CHECK_NR() is not
 * defined at all.
 */
#if DEBUG
# define CHECK_NR(pg)							\
	do {								\
		if (!VALID_PAGE(pg)) {					\
			printk(KERN_ERR "kfree: out of range ptr %lxh.\n", \
				(unsigned long)objp);			\
			BUG();						\
		}							\
	} while (0)

# define CHECK_PAGE(pg)							\
	do {								\
		CHECK_NR(pg);						\
		if (!PageSlab(pg)) {					\
			printk(KERN_ERR "kfree: bad ptr %lxh.\n",	\
				(unsigned long)objp);			\
			BUG();						\
		}							\
	} while (0)

#else
# define CHECK_PAGE(pg)	do { } while (0)
#endif
1392
1393 static inline void kmem_cache_free_one(kmem_cache_t *cachep, void *objp)
1394 {
1395 slab_t* slabp;
1396
1397 CHECK_PAGE(virt_to_page(objp));
1398 /* reduces memory footprint
1399 *
1400 if (OPTIMIZE(cachep))
1401 slabp = (void*)((unsigned long)objp&(~(PAGE_SIZE-1)));
1402 else
1403 */
1404 slabp = GET_PAGE_SLAB(virt_to_page(objp));
1405
1406 #if DEBUG
1407 if (cachep->flags & SLAB_DEBUG_INITIAL)
1408 /* Need to call the slab's constructor so the
1409 * caller can perform a verify of its state (debugging).
1410 * Called without the cache-lock held.
1411 */
1412 cachep->ctor(objp, cachep, SLAB_CTOR_CONSTRUCTOR|SLAB_CTOR_VERIFY);
1413
1414 if (cachep->flags & SLAB_RED_ZONE) {
1415 objp -= BYTES_PER_WORD;
1416 if (xchg((unsigned long *)objp, RED_MAGIC1) != RED_MAGIC2)
1417 /* Either write before start, or a double free. */
1418 BUG();
1419 if (xchg((unsigned long *)(objp+cachep->objsize -
1420 BYTES_PER_WORD), RED_MAGIC1) != RED_MAGIC2)
1421 /* Either write past end, or a double free. */
1422 BUG();
1423 }
1424 if (cachep->flags & SLAB_POISON)
1425 kmem_poison_obj(cachep, objp);
1426 if (kmem_extra_free_checks(cachep, slabp, objp))
1427 return;
1428 #endif
1429 {
1430 unsigned int objnr = (objp-slabp->s_mem)/cachep->objsize;
1431
1432 slab_bufctl(slabp)[objnr] = slabp->free;
1433 slabp->free = objnr;
1434 }
1435 STATS_DEC_ACTIVE(cachep);
1436
1437 /* fixup slab chains */
1438 {
1439 int inuse = slabp->inuse;
1440 if (unlikely(!--slabp->inuse)) {
1441 /* Was partial or full, now empty. */
1442 list_del(&slabp->list);
1443 list_add(&slabp->list, &cachep->slabs_free);
1444 } else if (unlikely(inuse == cachep->num)) {
1445 /* Was full. */
1446 list_del(&slabp->list);
1447 list_add(&slabp->list, &cachep->slabs_partial);
1448 }
1449 }
1450 }
1451
1452 #ifdef CONFIG_SMP
1453 static inline void __free_block (kmem_cache_t* cachep,
1454 void** objpp, int len)
1455 {
1456 for ( ; len > 0; len--, objpp++)
1457 kmem_cache_free_one(cachep, *objpp);
1458 }
1459
1460 static void free_block (kmem_cache_t* cachep, void** objpp, int len)
1461 {
1462 spin_lock(&cachep->spinlock);
1463 __free_block(cachep, objpp, len);
1464 spin_unlock(&cachep->spinlock);
1465 }
1466 #endif
1467
1468 /*
1469 * __kmem_cache_free
1470 * called with disabled ints
1471 */
1472 static inline void __kmem_cache_free (kmem_cache_t *cachep, void* objp)
1473 {
1474 #ifdef CONFIG_SMP
1475 cpucache_t *cc = cc_data(cachep);
1476
1477 CHECK_PAGE(virt_to_page(objp));
1478 if (cc) {
1479 int batchcount;
1480 if (cc->avail < cc->limit) {
1481 STATS_INC_FREEHIT(cachep);
1482 cc_entry(cc)[cc->avail++] = objp;
1483 return;
1484 }
1485 STATS_INC_FREEMISS(cachep);
1486 batchcount = cachep->batchcount;
1487 cc->avail -= batchcount;
1488 free_block(cachep,
1489 &cc_entry(cc)[cc->avail],batchcount);
1490 cc_entry(cc)[cc->avail++] = objp;
1491 return;
1492 } else {
1493 free_block(cachep, &objp, 1);
1494 }
1495 #else
1496 kmem_cache_free_one(cachep, objp);
1497 #endif
1498 }
1499
1500 /**
1501 * kmem_cache_alloc - Allocate an object
1502 * @cachep: The cache to allocate from.
1503 * @flags: See kmalloc().
1504 *
1505 * Allocate an object from this cache. The flags are only relevant
1506 * if the cache has no available objects.
1507 */
1508 void * kmem_cache_alloc (kmem_cache_t *cachep, int flags)
1509 {
1510 return __kmem_cache_alloc(cachep, flags);
1511 }
1512
1513 /**
1514 * kmalloc - allocate memory
1515 * @size: how many bytes of memory are required.
1516 * @flags: the type of memory to allocate.
1517 *
1518 * kmalloc is the normal method of allocating memory
1519 * in the kernel.
1520 *
1521 * The @flags argument may be one of:
1522 *
1523 * %GFP_USER - Allocate memory on behalf of user. May sleep.
1524 *
1525 * %GFP_KERNEL - Allocate normal kernel ram. May sleep.
1526 *
1527 * %GFP_ATOMIC - Allocation will not sleep. Use inside interrupt handlers.
1528 *
1529 * Additionally, the %GFP_DMA flag may be set to indicate the memory
1530 * must be suitable for DMA. This can mean different things on different
1531 * platforms. For example, on i386, it means that the memory must come
1532 * from the first 16MB.
1533 */
1534 void * kmalloc (size_t size, int flags)
1535 {
1536 cache_sizes_t *csizep = cache_sizes;
1537
1538 for (; csizep->cs_size; csizep++) {
1539 if (size > csizep->cs_size)
1540 continue;
1541 return __kmem_cache_alloc(flags & GFP_DMA ?
1542 csizep->cs_dmacachep : csizep->cs_cachep, flags);
1543 }
1544 return NULL;
1545 }
1546
1547 /**
1548 * kmem_cache_free - Deallocate an object
1549 * @cachep: The cache the allocation was from.
1550 * @objp: The previously allocated object.
1551 *
1552 * Free an object which was previously allocated from this
1553 * cache.
1554 */
1555 void kmem_cache_free (kmem_cache_t *cachep, void *objp)
1556 {
1557 unsigned long flags;
1558 #if DEBUG
1559 CHECK_PAGE(virt_to_page(objp));
1560 if (cachep != GET_PAGE_CACHE(virt_to_page(objp)))
1561 BUG();
1562 #endif
1563
1564 local_irq_save(flags);
1565 __kmem_cache_free(cachep, objp);
1566 local_irq_restore(flags);
1567 }
1568
1569 /**
1570 * kfree - free previously allocated memory
1571 * @objp: pointer returned by kmalloc.
1572 *
1573 * Don't free memory not originally allocated by kmalloc()
1574 * or you will run into trouble.
1575 */
1576 void kfree (const void *objp)
1577 {
1578 kmem_cache_t *c;
1579 unsigned long flags;
1580
1581 if (!objp)
1582 return;
1583 local_irq_save(flags);
1584 CHECK_PAGE(virt_to_page(objp));
1585 c = GET_PAGE_CACHE(virt_to_page(objp));
1586 __kmem_cache_free(c, (void*)objp);
1587 local_irq_restore(flags);
1588 }
1589
1590 kmem_cache_t * kmem_find_general_cachep (size_t size, int gfpflags)
1591 {
1592 cache_sizes_t *csizep = cache_sizes;
1593
1594 /* This function could be moved to the header file, and
1595 * made inline so consumers can quickly determine what
1596 * cache pointer they require.
1597 */
1598 for ( ; csizep->cs_size; csizep++) {
1599 if (size > csizep->cs_size)
1600 continue;
1601 break;
1602 }
1603 return (gfpflags & GFP_DMA) ? csizep->cs_dmacachep : csizep->cs_cachep;
1604 }
1605
1606 #ifdef CONFIG_SMP
1607
1608 /* called with cache_chain_sem acquired. */
1609 static int kmem_tune_cpucache (kmem_cache_t* cachep, int limit, int batchcount)
1610 {
1611 ccupdate_struct_t new;
1612 int i;
1613
1614 /*
1615 * These are admin-provided, so we are more graceful.
1616 */
1617 if (limit < 0)
1618 return -EINVAL;
1619 if (batchcount < 0)
1620 return -EINVAL;
1621 if (batchcount > limit)
1622 return -EINVAL;
1623 if (limit != 0 && !batchcount)
1624 return -EINVAL;
1625
1626 memset(&new.new,0,sizeof(new.new));
1627 if (limit) {
1628 for (i = 0; i< smp_num_cpus; i++) {
1629 cpucache_t* ccnew;
1630
1631 ccnew = kmalloc(sizeof(void*)*limit+
1632 sizeof(cpucache_t), GFP_KERNEL);
1633 if (!ccnew)
1634 goto oom;
1635 ccnew->limit = limit;
1636 ccnew->avail = 0;
1637 new.new[cpu_logical_map(i)] = ccnew;
1638 }
1639 }
1640 new.cachep = cachep;
1641 spin_lock_irq(&cachep->spinlock);
1642 cachep->batchcount = batchcount;
1643 spin_unlock_irq(&cachep->spinlock);
1644
1645 smp_call_function_all_cpus(do_ccupdate_local, (void *)&new);
1646
1647 for (i = 0; i < smp_num_cpus; i++) {
1648 cpucache_t* ccold = new.new[cpu_logical_map(i)];
1649 if (!ccold)
1650 continue;
1651 local_irq_disable();
1652 free_block(cachep, cc_entry(ccold), ccold->avail);
1653 local_irq_enable();
1654 kfree(ccold);
1655 }
1656 return 0;
1657 oom:
1658 for (i--; i >= 0; i--)
1659 kfree(new.new[cpu_logical_map(i)]);
1660 return -ENOMEM;
1661 }
1662
1663 static void enable_cpucache (kmem_cache_t *cachep)
1664 {
1665 int err;
1666 int limit;
1667
1668 /* FIXME: optimize */
1669 if (cachep->objsize > PAGE_SIZE)
1670 return;
1671 if (cachep->objsize > 1024)
1672 limit = 60;
1673 else if (cachep->objsize > 256)
1674 limit = 124;
1675 else
1676 limit = 252;
1677
1678 err = kmem_tune_cpucache(cachep, limit, limit/2);
1679 if (err)
1680 printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n",
1681 cachep->name, -err);
1682 }
1683
1684 static void enable_all_cpucaches (void)
1685 {
1686 struct list_head* p;
1687
1688 down(&cache_chain_sem);
1689
1690 p = &cache_cache.next;
1691 do {
1692 kmem_cache_t* cachep = list_entry(p, kmem_cache_t, next);
1693
1694 enable_cpucache(cachep);
1695 p = cachep->next.next;
1696 } while (p != &cache_cache.next);
1697
1698 up(&cache_chain_sem);
1699 }
1700 #endif
1701
1702 /**
1703 * kmem_cache_reap - Reclaim memory from caches.
1704 * @gfp_mask: the type of memory required.
1705 *
1706 * Called from do_try_to_free_pages() and __alloc_pages()
1707 */
1708 int kmem_cache_reap (int gfp_mask)
1709 {
1710 slab_t *slabp;
1711 kmem_cache_t *searchp;
1712 kmem_cache_t *best_cachep;
1713 unsigned int best_pages;
1714 unsigned int best_len;
1715 unsigned int scan;
1716 int ret = 0;
1717
1718 if (gfp_mask & __GFP_WAIT)
1719 down(&cache_chain_sem);
1720 else
1721 if (down_trylock(&cache_chain_sem))
1722 return 0;
1723
1724 scan = REAP_SCANLEN;
1725 best_len = 0;
1726 best_pages = 0;
1727 best_cachep = NULL;
1728 searchp = clock_searchp;
1729 do {
1730 unsigned int pages;
1731 struct list_head* p;
1732 unsigned int full_free;
1733
1734 /* It's safe to test this without holding the cache-lock. */
1735 if (searchp->flags & SLAB_NO_REAP)
1736 goto next;
1737 spin_lock_irq(&searchp->spinlock);
1738 if (searchp->growing)
1739 goto next_unlock;
1740 if (searchp->dflags & DFLGS_GROWN) {
1741 searchp->dflags &= ~DFLGS_GROWN;
1742 goto next_unlock;
1743 }
1744 #ifdef CONFIG_SMP
1745 {
1746 cpucache_t *cc = cc_data(searchp);
1747 if (cc && cc->avail) {
1748 __free_block(searchp, cc_entry(cc), cc->avail);
1749 cc->avail = 0;
1750 }
1751 }
1752 #endif
1753
1754 full_free = 0;
1755 p = searchp->slabs_free.next;
1756 while (p != &searchp->slabs_free) {
1757 slabp = list_entry(p, slab_t, list);
1758 #if DEBUG
1759 if (slabp->inuse)
1760 BUG();
1761 #endif
1762 full_free++;
1763 p = p->next;
1764 }
1765
1766 /*
1767 * Try to avoid slabs with constructors and/or
1768 * more than one page per slab (as it can be difficult
1769 * to get high orders from gfp()).
1770 */
1771 pages = full_free * (1<<searchp->gfporder);
1772 if (searchp->ctor)
1773 pages = (pages*4+1)/5;
1774 if (searchp->gfporder)
1775 pages = (pages*4+1)/5;
1776 if (pages > best_pages) {
1777 best_cachep = searchp;
1778 best_len = full_free;
1779 best_pages = pages;
1780 if (pages >= REAP_PERFECT) {
1781 clock_searchp = list_entry(searchp->next.next,
1782 kmem_cache_t,next);
1783 goto perfect;
1784 }
1785 }
1786 next_unlock:
1787 spin_unlock_irq(&searchp->spinlock);
1788 next:
1789 searchp = list_entry(searchp->next.next,kmem_cache_t,next);
1790 } while (--scan && searchp != clock_searchp);
1791
1792 clock_searchp = searchp;
1793
1794 if (!best_cachep)
1795 /* couldn't find anything to reap */
1796 goto out;
1797
1798 spin_lock_irq(&best_cachep->spinlock);
1799 perfect:
1800 /* free only 50% of the free slabs */
1801 best_len = (best_len + 1)/2;
1802 for (scan = 0; scan < best_len; scan++) {
1803 struct list_head *p;
1804
1805 if (best_cachep->growing)
1806 break;
1807 p = best_cachep->slabs_free.prev;
1808 if (p == &best_cachep->slabs_free)
1809 break;
1810 slabp = list_entry(p,slab_t,list);
1811 #if DEBUG
1812 if (slabp->inuse)
1813 BUG();
1814 #endif
1815 list_del(&slabp->list);
1816 STATS_INC_REAPED(best_cachep);
1817
1818 /* Safe to drop the lock. The slab is no longer linked to the
1819 * cache.
1820 */
1821 spin_unlock_irq(&best_cachep->spinlock);
1822 kmem_slab_destroy(best_cachep, slabp);
1823 spin_lock_irq(&best_cachep->spinlock);
1824 }
1825 spin_unlock_irq(&best_cachep->spinlock);
1826 ret = scan * (1 << best_cachep->gfporder);
1827 out:
1828 up(&cache_chain_sem);
1829 return ret;
1830 }
1831
1832 #ifdef CONFIG_PROC_FS
/* /proc/slabinfo
 *	cache-name num-active-objs total-objs
 *	obj-size num-active-slabs total-slabs
 *	num-pages-per-slab
 */
/*
 * FIXUP(t) - bookkeeping after each piece of output in proc_getdata().
 * Operates on the caller's variables len, off and count:
 *  - while everything produced so far lies before the requested offset,
 *    that output is discarded (off is reduced by len, len restarts at 0);
 *  - once more than count bytes past the offset exist, jump to label t.
 */
#define FIXUP(t)					\
	do {						\
		if (len > off) {			\
			if (len-off > count)		\
				goto t;			\
		} else {				\
			off -= len;			\
			len = 0;			\
		}					\
	} while (0)
1848
1849 static int proc_getdata (char*page, char**start, off_t off, int count)
1850 {
1851 struct list_head *p;
1852 int len = 0;
1853
1854 /* Output format version, so at least we can change it without _too_
1855 * many complaints.
1856 */
1857 len += sprintf(page+len, "slabinfo - version: 1.1"
1858 #if STATS
1859 " (statistics)"
1860 #endif
1861 #ifdef CONFIG_SMP
1862 " (SMP)"
1863 #endif
1864 "\n");
1865 FIXUP(got_data);
1866
1867 down(&cache_chain_sem);
1868 p = &cache_cache.next;
1869 do {
1870 kmem_cache_t *cachep;
1871 struct list_head *q;
1872 slab_t *slabp;
1873 unsigned long active_objs;
1874 unsigned long num_objs;
1875 unsigned long active_slabs = 0;
1876 unsigned long num_slabs;
1877 cachep = list_entry(p, kmem_cache_t, next);
1878
1879 spin_lock_irq(&cachep->spinlock);
1880 active_objs = 0;
1881 num_slabs = 0;
1882 list_for_each(q,&cachep->slabs_full) {
1883 slabp = list_entry(q, slab_t, list);
1884 if (slabp->inuse != cachep->num)
1885 BUG();
1886 active_objs += cachep->num;
1887 active_slabs++;
1888 }
1889 list_for_each(q,&cachep->slabs_partial) {
1890 slabp = list_entry(q, slab_t, list);
1891 if (slabp->inuse == cachep->num || !slabp->inuse)
1892 BUG();
1893 active_objs += slabp->inuse;
1894 active_slabs++;
1895 }
1896 list_for_each(q,&cachep->slabs_free) {
1897 slabp = list_entry(q, slab_t, list);
1898 if (slabp->inuse)
1899 BUG();
1900 num_slabs++;
1901 }
1902 num_slabs+=active_slabs;
1903 num_objs = num_slabs*cachep->num;
1904
1905 len += sprintf(page+len, "%-17s %6lu %6lu %6u %4lu %4lu %4u",
1906 cachep->name, active_objs, num_objs, cachep->objsize,
1907 active_slabs, num_slabs, (1<<cachep->gfporder));
1908
1909 #if STATS
1910 {
1911 unsigned long errors = cachep->errors;
1912 unsigned long high = cachep->high_mark;
1913 unsigned long grown = cachep->grown;
1914 unsigned long reaped = cachep->reaped;
1915 unsigned long allocs = cachep->num_allocations;
1916
1917 len += sprintf(page+len, " : %6lu %7lu %5lu %4lu %4lu",
1918 high, allocs, grown, reaped, errors);
1919 }
1920 #endif
1921 #ifdef CONFIG_SMP
1922 {
1923 unsigned int batchcount = cachep->batchcount;
1924 unsigned int limit;
1925
1926 if (cc_data(cachep))
1927 limit = cc_data(cachep)->limit;
1928 else
1929 limit = 0;
1930 len += sprintf(page+len, " : %4u %4u",
1931 limit, batchcount);
1932 }
1933 #endif
1934 #if STATS && defined(CONFIG_SMP)
1935 {
1936 unsigned long allochit = atomic_read(&cachep->allochit);
1937 unsigned long allocmiss = atomic_read(&cachep->allocmiss);
1938 unsigned long freehit = atomic_read(&cachep->freehit);
1939 unsigned long freemiss = atomic_read(&cachep->freemiss);
1940 len += sprintf(page+len, " : %6lu %6lu %6lu %6lu",
1941 allochit, allocmiss, freehit, freemiss);
1942 }
1943 #endif
1944 len += sprintf(page+len,"\n");
1945 spin_unlock_irq(&cachep->spinlock);
1946 FIXUP(got_data_up);
1947 p = cachep->next.next;
1948 } while (p != &cache_cache.next);
1949 got_data_up:
1950 up(&cache_chain_sem);
1951
1952 got_data:
1953 *start = page+off;
1954 return len;
1955 }
1956
1957 /**
1958 * slabinfo_read_proc - generates /proc/slabinfo
1959 * @page: scratch area, one page long
1960 * @start: pointer to the pointer to the output buffer
1961 * @off: offset within /proc/slabinfo the caller is interested in
1962 * @count: requested len in bytes
1963 * @eof: eof marker
1964 * @data: unused
1965 *
1966 * The contents of the buffer are
1967 * cache-name
1968 * num-active-objs
1969 * total-objs
1970 * object size
1971 * num-active-slabs
1972 * total-slabs
1973 * num-pages-per-slab
1974 * + further values on SMP and with statistics enabled
1975 */
1976 int slabinfo_read_proc (char *page, char **start, off_t off,
1977 int count, int *eof, void *data)
1978 {
1979 int len = proc_getdata(page, start, off, count);
1980 len -= (*start-page);
1981 if (len <= count)
1982 *eof = 1;
1983 if (len>count) len = count;
1984 if (len<0) len = 0;
1985 return len;
1986 }
1987
1988 #define MAX_SLABINFO_WRITE 128
1989 /**
1990 * slabinfo_write_proc - SMP tuning for the slab allocator
1991 * @file: unused
1992 * @buffer: user buffer
1993 * @count: data len
1994 * @data: unused
1995 */
1996 int slabinfo_write_proc (struct file *file, const char *buffer,
1997 unsigned long count, void *data)
1998 {
1999 #ifdef CONFIG_SMP
2000 char kbuf[MAX_SLABINFO_WRITE+1], *tmp;
2001 int limit, batchcount, res;
2002 struct list_head *p;
2003
2004 if (count > MAX_SLABINFO_WRITE)
2005 return -EINVAL;
2006 if (copy_from_user(&kbuf, buffer, count))
2007 return -EFAULT;
2008 kbuf[MAX_SLABINFO_WRITE] = '\0';
2009
2010 tmp = strchr(kbuf, ' ');
2011 if (!tmp)
2012 return -EINVAL;
2013 *tmp = '\0';
2014 tmp++;
2015 limit = simple_strtol(tmp, &tmp, 10);
2016 while (*tmp == ' ')
2017 tmp++;
2018 batchcount = simple_strtol(tmp, &tmp, 10);
2019
2020 /* Find the cache in the chain of caches. */
2021 down(&cache_chain_sem);
2022 res = -EINVAL;
2023 list_for_each(p,&cache_chain) {
2024 kmem_cache_t *cachep = list_entry(p, kmem_cache_t, next);
2025
2026 if (!strcmp(cachep->name, kbuf)) {
2027 res = kmem_tune_cpucache(cachep, limit, batchcount);
2028 break;
2029 }
2030 }
2031 up(&cache_chain_sem);
2032 if (res >= 0)
2033 res = count;
2034 return res;
2035 #else
2036 return -EINVAL;
2037 #endif
2038 }
2039 #endif
2040