Index: linux-2.6.10/include/asm-i386/mmx.h =================================================================== --- linux-2.6.10.orig/include/asm-i386/mmx.h 2004-12-24 13:34:57.000000000 -0800 +++ linux-2.6.10/include/asm-i386/mmx.h 2005-02-07 11:17:24.000000000 -0800 @@ -8,7 +8,7 @@ #include extern void *_mmx_memcpy(void *to, const void *from, size_t size); -extern void mmx_clear_page(void *page); +extern void mmx_clear_page(void *page, int order); extern void mmx_copy_page(void *to, void *from); #endif Index: linux-2.6.10/drivers/base/node.c =================================================================== --- linux-2.6.10.orig/drivers/base/node.c 2005-02-03 22:50:21.000000000 -0800 +++ linux-2.6.10/drivers/base/node.c 2005-02-07 11:17:27.000000000 -0800 @@ -42,13 +42,15 @@ static ssize_t node_read_meminfo(struct unsigned long inactive; unsigned long active; unsigned long free; + unsigned long zero; si_meminfo_node(&i, nid); - __get_zone_counts(&active, &inactive, &free, NODE_DATA(nid)); + __get_zone_counts(&active, &inactive, &free, &zero, NODE_DATA(nid)); n = sprintf(buf, "\n" "Node %d MemTotal: %8lu kB\n" "Node %d MemFree: %8lu kB\n" + "Node %d MemZero: %8lu kB\n" "Node %d MemUsed: %8lu kB\n" "Node %d Active: %8lu kB\n" "Node %d Inactive: %8lu kB\n" @@ -58,6 +60,7 @@ static ssize_t node_read_meminfo(struct "Node %d LowFree: %8lu kB\n", nid, K(i.totalram), nid, K(i.freeram), + nid, K(zero), nid, K(i.totalram - i.freeram), nid, K(active), nid, K(inactive), Index: linux-2.6.10/arch/i386/lib/mmx.c =================================================================== --- linux-2.6.10.orig/arch/i386/lib/mmx.c 2004-12-24 13:34:48.000000000 -0800 +++ linux-2.6.10/arch/i386/lib/mmx.c 2005-02-07 11:17:24.000000000 -0800 @@ -128,7 +128,7 @@ void *_mmx_memcpy(void *to, const void * * other MMX using processors do not. */ -static void fast_clear_page(void *page) +static void fast_clear_page(void *page, int order) { int i; @@ -138,7 +138,7 @@ static void fast_clear_page(void *page) " pxor %%mm0, %%mm0\n" : : ); - for(i=0;i<4096/64;i++) + for(i=0;i<((4096/64) << order);i++) { __asm__ __volatile__ ( " movntq %%mm0, (%0)\n" @@ -257,7 +257,7 @@ static void fast_copy_page(void *to, voi * Generic MMX implementation without K7 specific streaming */ -static void fast_clear_page(void *page) +static void fast_clear_page(void *page, int order) { int i; @@ -267,7 +267,7 @@ static void fast_clear_page(void *page) " pxor %%mm0, %%mm0\n" : : ); - for(i=0;i<4096/128;i++) + for(i=0;i<((4096/128) << order);i++) { __asm__ __volatile__ ( " movq %%mm0, (%0)\n" @@ -359,23 +359,23 @@ static void fast_copy_page(void *to, voi * Favour MMX for page clear and copy. 
*/ -static void slow_zero_page(void * page) +static void slow_clear_page(void * page, int order) { int d0, d1; __asm__ __volatile__( \ "cld\n\t" \ "rep ; stosl" \ : "=&c" (d0), "=&D" (d1) - :"a" (0),"1" (page),"0" (1024) + :"a" (0),"1" (page),"0" (1024 << order) :"memory"); } - -void mmx_clear_page(void * page) + +void mmx_clear_page(void * page, int order) { if(unlikely(in_interrupt())) - slow_zero_page(page); + slow_clear_page(page, order); else - fast_clear_page(page); + fast_clear_page(page, order); } static void slow_copy_page(void *to, void *from) Index: linux-2.6.10/include/linux/sched.h =================================================================== --- linux-2.6.10.orig/include/linux/sched.h 2005-02-03 22:51:50.000000000 -0800 +++ linux-2.6.10/include/linux/sched.h 2005-02-07 11:17:27.000000000 -0800 @@ -735,6 +735,7 @@ do { if (atomic_dec_and_test(&(tsk)->usa #define PF_LESS_THROTTLE 0x00100000 /* Throttle me less: I clean memory */ #define PF_SYNCWRITE 0x00200000 /* I am doing a sync write */ #define PF_BORROWED_MM 0x00400000 /* I am a kthread doing use_mm */ +#define PF_KSCRUBD 0x00800000 /* I am kscrubd */ /* * Only the _current_ task can read/write to tsk->flags, but other Index: linux-2.6.10/include/linux/gfp.h =================================================================== --- linux-2.6.10.orig/include/linux/gfp.h 2005-02-03 22:51:46.000000000 -0800 +++ linux-2.6.10/include/linux/gfp.h 2005-02-07 11:17:24.000000000 -0800 @@ -131,4 +131,5 @@ extern void FASTCALL(free_cold_page(stru void page_alloc_init(void); +void prep_zero_page(struct page *, unsigned int order, unsigned int gfp_flags); #endif /* __LINUX_GFP_H */ Index: linux-2.6.10/fs/proc/proc_misc.c =================================================================== --- linux-2.6.10.orig/fs/proc/proc_misc.c 2005-02-03 22:51:20.000000000 -0800 +++ linux-2.6.10/fs/proc/proc_misc.c 2005-02-07 11:17:27.000000000 -0800 @@ -123,12 +123,13 @@ static int meminfo_read_proc(char *page, unsigned long inactive; unsigned long active; unsigned long free; + unsigned long zero; unsigned long committed; unsigned long allowed; struct vmalloc_info vmi; get_page_state(&ps); - get_zone_counts(&active, &inactive, &free); + get_zone_counts(&active, &inactive, &free, &zero); /* * display in kilobytes. 
@@ -148,6 +149,7 @@ static int meminfo_read_proc(char *page, len = sprintf(page, "MemTotal: %8lu kB\n" "MemFree: %8lu kB\n" + "MemZero: %8lu kB\n" "Buffers: %8lu kB\n" "Cached: %8lu kB\n" "SwapCached: %8lu kB\n" @@ -171,6 +173,7 @@ static int meminfo_read_proc(char *page, "VmallocChunk: %8lu kB\n", K(i.totalram), K(i.freeram), + K(zero), K(i.bufferram), K(get_page_cache_size()-total_swapcache_pages-i.bufferram), K(total_swapcache_pages), Index: linux-2.6.10/include/asm-ia64/page.h =================================================================== --- linux-2.6.10.orig/include/asm-ia64/page.h 2005-02-03 22:51:35.000000000 -0800 +++ linux-2.6.10/include/asm-ia64/page.h 2005-02-07 11:17:24.000000000 -0800 @@ -56,8 +56,10 @@ # ifdef __KERNEL__ # define STRICT_MM_TYPECHECKS -extern void clear_page (void *page); +extern void clear_pages (void *page, int order); extern void copy_page (void *to, void *from); +#define clear_page(__page) clear_pages(__page, 0) +#define __HAVE_ARCH_CLEAR_PAGES /* * clear_user_page() and copy_user_page() can't be inline functions because Index: linux-2.6.10/mm/hugetlb.c =================================================================== --- linux-2.6.10.orig/mm/hugetlb.c 2005-02-03 22:51:56.000000000 -0800 +++ linux-2.6.10/mm/hugetlb.c 2005-02-07 11:17:24.000000000 -0800 @@ -78,7 +78,6 @@ void free_huge_page(struct page *page) struct page *alloc_huge_page(void) { struct page *page; - int i; spin_lock(&hugetlb_lock); page = dequeue_huge_page(); @@ -89,8 +88,7 @@ struct page *alloc_huge_page(void) spin_unlock(&hugetlb_lock); set_page_count(page, 1); page[1].mapping = (void *)free_huge_page; - for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); ++i) - clear_highpage(&page[i]); + prep_zero_page(page, HUGETLB_PAGE_ORDER, GFP_HIGHUSER); return page; } Index: linux-2.6.10/arch/ia64/lib/clear_page.S =================================================================== --- linux-2.6.10.orig/arch/ia64/lib/clear_page.S 2004-12-24 13:33:50.000000000 -0800 +++ linux-2.6.10/arch/ia64/lib/clear_page.S 2005-02-07 11:17:24.000000000 -0800 @@ -7,6 +7,7 @@ * 1/06/01 davidm Tuned for Itanium. 
* 2/12/02 kchen Tuned for both Itanium and McKinley * 3/08/02 davidm Some more tweaking + * 12/10/04 clameter Make it work on pages of order size */ #include @@ -29,27 +30,33 @@ #define dst4 r11 #define dst_last r31 +#define totsize r14 -GLOBAL_ENTRY(clear_page) +GLOBAL_ENTRY(clear_pages) .prologue - .regstk 1,0,0,0 - mov r16 = PAGE_SIZE/L3_LINE_SIZE-1 // main loop count, -1=repeat/until + .regstk 2,0,0,0 + mov r16 = PAGE_SIZE/L3_LINE_SIZE // main loop count + mov totsize = PAGE_SIZE .save ar.lc, saved_lc mov saved_lc = ar.lc - + ;; .body + adds dst1 = 16, in0 mov ar.lc = (PREFETCH_LINES - 1) mov dst_fetch = in0 - adds dst1 = 16, in0 adds dst2 = 32, in0 + shl r16 = r16, in1 + shl totsize = totsize, in1 ;; .fetch: stf.spill.nta [dst_fetch] = f0, L3_LINE_SIZE adds dst3 = 48, in0 // executing this multiple times is harmless br.cloop.sptk.few .fetch + add r16 = -1,r16 + add dst_last = totsize, dst_fetch + adds dst4 = 64, in0 ;; - addl dst_last = (PAGE_SIZE - PREFETCH_LINES*L3_LINE_SIZE), dst_fetch mov ar.lc = r16 // one L3 line per iteration - adds dst4 = 64, in0 + adds dst_last = -PREFETCH_LINES*L3_LINE_SIZE, dst_last ;; #ifdef CONFIG_ITANIUM // Optimized for Itanium @@ -74,4 +81,4 @@ GLOBAL_ENTRY(clear_page) ;; mov ar.lc = saved_lc // restore lc br.ret.sptk.many rp -END(clear_page) +END(clear_pages) Index: linux-2.6.10/mm/page_alloc.c =================================================================== --- linux-2.6.10.orig/mm/page_alloc.c 2005-02-03 22:51:57.000000000 -0800 +++ linux-2.6.10/mm/page_alloc.c 2005-02-07 11:17:27.000000000 -0800 @@ -12,6 +12,8 @@ * Zone balancing, Kanoj Sarcar, SGI, Jan 2000 * Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002 * (lots of bits borrowed from Ingo Molnar & Andrew Morton) + * Page zeroing by Christoph Lameter, SGI, Dec 2004 using + * initial code for __GFP_ZERO support by Andrea Arcangeli, Oct 2004. */ #include @@ -33,6 +35,7 @@ #include #include #include +#include #include #include "internal.h" @@ -175,16 +178,16 @@ static void destroy_compound_page(struct * zone->lock is already acquired when we use these. * So, we don't need atomic page->flags operations here. */ -static inline unsigned long page_order(struct page *page) { +static inline unsigned long page_zorder(struct page *page) { return page->private; } -static inline void set_page_order(struct page *page, int order) { - page->private = order; +static inline void set_page_zorder(struct page *page, int order, int zero) { + page->private = order + (zero << 10); __SetPagePrivate(page); } -static inline void rmv_page_order(struct page *page) +static inline void rmv_page_zorder(struct page *page) { __ClearPagePrivate(page); page->private = 0; @@ -195,14 +198,15 @@ static inline void rmv_page_order(struct * we can do coalesce a page and its buddy if * (a) the buddy is free && * (b) the buddy is on the buddy system && - * (c) a page and its buddy have the same order. + * (c) a page and its buddy have the same order and the same + * zeroing status. * for recording page's order, we use page->private and PG_private. 
* */ -static inline int page_is_buddy(struct page *page, int order) +static inline int page_is_buddy(struct page *page, int order, int zero) { if (PagePrivate(page) && - (page_order(page) == order) && + (page_zorder(page) == order + (zero << 10)) && !PageReserved(page) && page_count(page) == 0) return 1; @@ -233,22 +237,20 @@ static inline int page_is_buddy(struct p * -- wli */ -static inline void __free_pages_bulk (struct page *page, struct page *base, - struct zone *zone, unsigned int order) +static inline int __free_pages_bulk (struct page *page, struct page *base, + struct zone *zone, unsigned int order, int zero) { unsigned long page_idx; struct page *coalesced; - int order_size = 1 << order; if (unlikely(order)) destroy_compound_page(page, order); page_idx = page - base; - BUG_ON(page_idx & (order_size - 1)); + BUG_ON(page_idx & (( 1 << order) - 1)); BUG_ON(bad_range(zone, page)); - zone->free_pages += order_size; while (order < MAX_ORDER-1) { struct free_area *area; struct page *buddy; @@ -258,20 +260,21 @@ static inline void __free_pages_bulk (st buddy = base + buddy_idx; if (bad_range(zone, buddy)) break; - if (!page_is_buddy(buddy, order)) + if (!page_is_buddy(buddy, order, zero)) break; /* Move the buddy up one level. */ list_del(&buddy->lru); - area = zone->free_area + order; + area = zone->free_area[zero] + order; area->nr_free--; - rmv_page_order(buddy); + rmv_page_zorder(buddy); page_idx &= buddy_idx; order++; } coalesced = base + page_idx; - set_page_order(coalesced, order); - list_add(&coalesced->lru, &zone->free_area[order].free_list); - zone->free_area[order].nr_free++; + set_page_zorder(coalesced, order, zero); + list_add(&coalesced->lru, &zone->free_area[zero][order].free_list); + zone->free_area[zero][order].nr_free++; + return order; } static inline void free_pages_check(const char *function, struct page *page) @@ -320,8 +323,11 @@ free_pages_bulk(struct zone *zone, int c page = list_entry(list->prev, struct page, lru); /* have to delete it as __free_pages_bulk list manipulates */ list_del(&page->lru); - __free_pages_bulk(page, base, zone, order); + if (__free_pages_bulk(page, base, zone, order, NOT_ZEROED) + >= sysctl_scrub_start) + wakeup_kscrubd(zone); ret++; + zone->free_pages += 1UL << order; } spin_unlock_irqrestore(&zone->lock, flags); return ret; @@ -349,6 +355,18 @@ void __free_pages_ok(struct page *page, free_pages_bulk(page_zone(page), 1, &list, order); } +void end_zero_page(struct page *page, unsigned int order) +{ + unsigned long flags; + struct zone * zone = page_zone(page); + + spin_lock_irqsave(&zone->lock, flags); + + __free_pages_bulk(page, zone->zone_mem_map, zone, order, ZEROED); + zone->zero_pages += 1UL << order; + + spin_unlock_irqrestore(&zone->lock, flags); +} /* * The order of subdivision here is critical for the IO subsystem. @@ -366,7 +384,7 @@ void __free_pages_ok(struct page *page, */ static inline struct page * expand(struct zone *zone, struct page *page, - int low, int high, struct free_area *area) + int low, int high, struct free_area *area, int zero) { unsigned long size = 1 << high; @@ -377,7 +395,7 @@ expand(struct zone *zone, struct page *p BUG_ON(bad_range(zone, &page[size])); list_add(&page[size].lru, &area->free_list); area->nr_free++; - set_page_order(&page[size], high); + set_page_zorder(&page[size], high, zero); } return page; } @@ -428,23 +446,44 @@ static void prep_new_page(struct page *p * Do the hard work of removing an element from the buddy allocator. * Call me with the zone->lock already held. 
*/ -static struct page *__rmqueue(struct zone *zone, unsigned int order) +static void inline rmpage(struct page *page, struct free_area *area) +{ + list_del(&page->lru); + rmv_page_zorder(page); + area->nr_free--; +} + +struct page *scrubd_rmpage(struct zone *zone, struct free_area *area) +{ + unsigned long flags; + struct page *page = NULL; + + spin_lock_irqsave(&zone->lock, flags); + if (!list_empty(&area->free_list)) { + page = list_entry(area->free_list.next, struct page, lru); + rmpage(page, area); + } + spin_unlock_irqrestore(&zone->lock, flags); + return page; +} + +static struct page *__rmqueue(struct zone *zone, unsigned int order, int zero) { - struct free_area * area; + struct free_area *area; unsigned int current_order; struct page *page; for (current_order = order; current_order < MAX_ORDER; ++current_order) { - area = zone->free_area + current_order; + area = zone->free_area[zero] + current_order; if (list_empty(&area->free_list)) continue; page = list_entry(area->free_list.next, struct page, lru); - list_del(&page->lru); - rmv_page_order(page); - area->nr_free--; + rmpage(page, zone->free_area[zero] + current_order); zone->free_pages -= 1UL << order; - return expand(zone, page, order, current_order, area); + if (zero) + zone->zero_pages -= 1UL << order; + return expand(zone, page, order, current_order, area, zero); } return NULL; @@ -456,7 +495,7 @@ static struct page *__rmqueue(struct zon * Returns the number of new pages which were placed at *list. */ static int rmqueue_bulk(struct zone *zone, unsigned int order, - unsigned long count, struct list_head *list) + unsigned long count, struct list_head *list, int zero) { unsigned long flags; int i; @@ -465,7 +504,7 @@ static int rmqueue_bulk(struct zone *zon spin_lock_irqsave(&zone->lock, flags); for (i = 0; i < count; ++i) { - page = __rmqueue(zone, order); + page = __rmqueue(zone, order, zero); if (page == NULL) break; allocated++; @@ -512,7 +551,7 @@ void mark_free_pages(struct zone *zone) ClearPageNosaveFree(pfn_to_page(zone_pfn + zone->zone_start_pfn)); for (order = MAX_ORDER - 1; order >= 0; --order) - list_for_each(curr, &zone->free_area[order].free_list) { + list_for_each(curr, &zone->free_area[NOT_ZEROED][order].free_list) { unsigned long start_pfn, i; start_pfn = page_to_pfn(list_entry(curr, struct page, lru)); @@ -599,11 +638,19 @@ void fastcall free_cold_page(struct page free_hot_cold_page(page, 1); } -static inline void prep_zero_page(struct page *page, int order, int gfp_flags) +void prep_zero_page(struct page *page, unsigned int order, unsigned int gfp_flags) { int i; BUG_ON((gfp_flags & (__GFP_WAIT | __GFP_HIGHMEM)) == __GFP_HIGHMEM); + +#ifdef __HAVE_ARCH_CLEAR_PAGES + if (!PageHighMem(page)) { + clear_pages(page_address(page), order); + return; + } +#endif + for(i = 0; i < (1 << order); i++) clear_highpage(page + i); } @@ -618,7 +665,9 @@ buffered_rmqueue(struct zone *zone, int { unsigned long flags; struct page *page = NULL; - int cold = !!(gfp_flags & __GFP_COLD); + int nr_pages = 1 << order; + int zero = !!((gfp_flags & __GFP_ZERO) && zone->zero_pages >= nr_pages); + int cold = !!(gfp_flags & __GFP_COLD) + 2*zero; if (order == 0) { struct per_cpu_pages *pcp; @@ -627,7 +676,7 @@ buffered_rmqueue(struct zone *zone, int local_irq_save(flags); if (pcp->count <= pcp->low) pcp->count += rmqueue_bulk(zone, 0, - pcp->batch, &pcp->list); + pcp->batch, &pcp->list, zero); if (pcp->count) { page = list_entry(pcp->list.next, struct page, lru); list_del(&page->lru); @@ -639,16 +688,25 @@ buffered_rmqueue(struct zone 
*zone, int if (page == NULL) { spin_lock_irqsave(&zone->lock, flags); - page = __rmqueue(zone, order); + page = __rmqueue(zone, order, zero); + /* + * If we failed to obtain a zero and/or unzeroed page + * then we may still be able to obtain the other + * type of page. + */ + if (!page) { + page = __rmqueue(zone, order, !zero); + zero = 0; + } spin_unlock_irqrestore(&zone->lock, flags); } if (page != NULL) { BUG_ON(bad_range(zone, page)); - mod_page_state_zone(zone, pgalloc, 1 << order); + mod_page_state_zone(zone, pgalloc, nr_pages); prep_new_page(page, order); - if (gfp_flags & __GFP_ZERO) + if ((gfp_flags & __GFP_ZERO) && !zero) prep_zero_page(page, order, gfp_flags); if (order && (gfp_flags & __GFP_COMP)) @@ -677,7 +735,7 @@ int zone_watermark_ok(struct zone *z, in return 0; for (o = 0; o < order; o++) { /* At the next order, this order's pages become unavailable */ - free_pages -= z->free_area[o].nr_free << o; + free_pages -= (z->free_area[NOT_ZEROED][o].nr_free + z->free_area[ZEROED][o].nr_free) << o; /* Require fewer higher order pages to be free */ min >>= 1; @@ -1085,7 +1143,7 @@ void __mod_page_state(unsigned offset, u EXPORT_SYMBOL(__mod_page_state); void __get_zone_counts(unsigned long *active, unsigned long *inactive, - unsigned long *free, struct pglist_data *pgdat) + unsigned long *free, unsigned long *zero, struct pglist_data *pgdat) { struct zone *zones = pgdat->node_zones; int i; @@ -1093,27 +1151,31 @@ void __get_zone_counts(unsigned long *ac *active = 0; *inactive = 0; *free = 0; + *zero = 0; for (i = 0; i < MAX_NR_ZONES; i++) { *active += zones[i].nr_active; *inactive += zones[i].nr_inactive; *free += zones[i].free_pages; + *zero += zones[i].zero_pages; } } void get_zone_counts(unsigned long *active, - unsigned long *inactive, unsigned long *free) + unsigned long *inactive, unsigned long *free, unsigned long *zero) { struct pglist_data *pgdat; *active = 0; *inactive = 0; *free = 0; + *zero = 0; for_each_pgdat(pgdat) { - unsigned long l, m, n; - __get_zone_counts(&l, &m, &n, pgdat); + unsigned long l, m, n,o; + __get_zone_counts(&l, &m, &n, &o, pgdat); *active += l; *inactive += m; *free += n; + *zero += o; } } @@ -1150,6 +1212,7 @@ void si_meminfo_node(struct sysinfo *val #define K(x) ((x) << (PAGE_SHIFT-10)) +const char *temp[3] = { "hot", "cold", "zero" }; /* * Show free area list (used inside shift_scroll-lock stuff) * We also calculate the percentage fragmentation. We do this by counting the @@ -1162,6 +1225,7 @@ void show_free_areas(void) unsigned long active; unsigned long inactive; unsigned long free; + unsigned long zero; struct zone *zone; for_each_zone(zone) { @@ -1182,10 +1246,10 @@ void show_free_areas(void) pageset = zone->pageset + cpu; - for (temperature = 0; temperature < 2; temperature++) + for (temperature = 0; temperature < 3; temperature++) printk("cpu %d %s: low %d, high %d, batch %d\n", cpu, - temperature ? 
"cold" : "hot", + temp[temperature], pageset->pcp[temperature].low, pageset->pcp[temperature].high, pageset->pcp[temperature].batch); @@ -1193,20 +1257,21 @@ void show_free_areas(void) } get_page_state(&ps); - get_zone_counts(&active, &inactive, &free); + get_zone_counts(&active, &inactive, &free, &zero); printk("\nFree pages: %11ukB (%ukB HighMem)\n", K(nr_free_pages()), K(nr_free_highpages())); printk("Active:%lu inactive:%lu dirty:%lu writeback:%lu " - "unstable:%lu free:%u slab:%lu mapped:%lu pagetables:%lu\n", + "unstable:%lu free:%u zero:%lu slab:%lu mapped:%lu pagetables:%lu\n", active, inactive, ps.nr_dirty, ps.nr_writeback, ps.nr_unstable, nr_free_pages(), + zero, ps.nr_slab, ps.nr_mapped, ps.nr_page_table_pages); @@ -1255,7 +1320,7 @@ void show_free_areas(void) spin_lock_irqsave(&zone->lock, flags); for (order = 0; order < MAX_ORDER; order++) { - nr = zone->free_area[order].nr_free; + nr = zone->free_area[NOT_ZEROED][order].nr_free + zone->free_area[ZEROED][order].nr_free; total += nr << order; printk("%lu*%lukB ", nr, K(1UL) << order); } @@ -1555,8 +1620,10 @@ void zone_init_free_lists(struct pglist_ { int order; for (order = 0; order < MAX_ORDER ; order++) { - INIT_LIST_HEAD(&zone->free_area[order].free_list); - zone->free_area[order].nr_free = 0; + INIT_LIST_HEAD(&zone->free_area[NOT_ZEROED][order].free_list); + INIT_LIST_HEAD(&zone->free_area[ZEROED][order].free_list); + zone->free_area[NOT_ZEROED][order].nr_free = 0; + zone->free_area[ZEROED][order].nr_free = 0; } } @@ -1581,6 +1648,7 @@ static void __init free_area_init_core(s pgdat->nr_zones = 0; init_waitqueue_head(&pgdat->kswapd_wait); + init_waitqueue_head(&pgdat->kscrubd_wait); pgdat->kswapd_max_order = 0; for (j = 0; j < MAX_NR_ZONES; j++) { @@ -1604,6 +1672,7 @@ static void __init free_area_init_core(s spin_lock_init(&zone->lru_lock); zone->zone_pgdat = pgdat; zone->free_pages = 0; + zone->zero_pages = 0; zone->temp_priority = zone->prev_priority = DEF_PRIORITY; @@ -1637,6 +1706,13 @@ static void __init free_area_init_core(s pcp->high = 2 * batch; pcp->batch = 1 * batch; INIT_LIST_HEAD(&pcp->list); + + pcp = &zone->pageset[cpu].pcp[2]; /* zero pages */ + pcp->count = 0; + pcp->low = 0; + pcp->high = 2 * batch; + pcp->batch = 1 * batch; + INIT_LIST_HEAD(&pcp->list); } printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n", zone_names[j], realsize, batch); @@ -1762,7 +1838,10 @@ static int frag_show(struct seq_file *m, spin_lock_irqsave(&zone->lock, flags); seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name); for (order = 0; order < MAX_ORDER; ++order) - seq_printf(m, "%6lu ", zone->free_area[order].nr_free); + seq_printf(m, "%6lu ", zone->free_area[NOT_ZEROED][order].nr_free); + seq_printf(m, "\n Zeroed Pages "); + for (order = 0; order < MAX_ORDER; ++order) + seq_printf(m, "%6lu ", zone->free_area[ZEROED][order].nr_free); spin_unlock_irqrestore(&zone->lock, flags); seq_putc(m, '\n'); } Index: linux-2.6.10/mm/Makefile =================================================================== --- linux-2.6.10.orig/mm/Makefile 2005-02-03 22:51:56.000000000 -0800 +++ linux-2.6.10/mm/Makefile 2005-02-07 11:17:27.000000000 -0800 @@ -5,7 +5,7 @@ mmu-y := nommu.o mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \ mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \ - vmalloc.o + vmalloc.o scrubd.o obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ page_alloc.o page-writeback.o pdflush.o \ Index: linux-2.6.10/mm/scrubd.c =================================================================== 
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.10/mm/scrubd.c	2005-02-07 11:17:27.000000000 -0800
@@ -0,0 +1,134 @@
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+unsigned int sysctl_scrub_start = 6;	/* if a page of this order is coalesced then run kscrubd */
+unsigned int sysctl_scrub_stop = 4;	/* Minimum order of page to zero */
+unsigned int sysctl_scrub_load = 999;	/* Do not run kscrubd if load is above this value */
+
+/*
+ * sysctl handler for /proc/sys/vm/scrub_start
+ */
+int scrub_start_handler(ctl_table *table, int write,
+	struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
+{
+	proc_dointvec(table, write, file, buffer, length, ppos);
+	if (sysctl_scrub_start < MAX_ORDER) {
+		struct zone *zone;
+
+		for_each_zone(zone)
+			wakeup_kscrubd(zone);
+	}
+	return 0;
+}
+
+LIST_HEAD(zero_drivers);
+
+/*
+ * zero_highest_order_page takes a page off the freelist
+ * and then hands it off to block zeroing agents.
+ * The cleared pages are added to the back of
+ * the freelist where the page allocator may pick them up.
+ */
+int zero_highest_order_page(struct zone *z)
+{
+	int order;
+
+	for(order = MAX_ORDER-1; order >= sysctl_scrub_stop; order--) {
+		struct free_area *area = z->free_area[NOT_ZEROED] + order;
+		if (!list_empty(&area->free_list)) {
+			struct page *page = scrubd_rmpage(z, area);
+			struct list_head *l;
+			int size = PAGE_SIZE << order;
+
+			if (!page)
+				continue;
+
+			list_for_each(l, &zero_drivers) {
+				struct zero_driver *driver = list_entry(l, struct zero_driver, list);
+
+				if (driver->start(page_address(page), size) == 0)
+					goto done;
+			}
+
+			/* Unable to find a zeroing device that would
+			 * deal with this page so just do it on our own.
+			 * This will likely thrash the cpu caches.
+			 */
+			cond_resched();
+			prep_zero_page(page, order, 0);
+done:
+			end_zero_page(page, order);
+			cond_resched();
+			return 1 << order;
+		}
+	}
+	return 0;
+}
+
+/*
+ * scrub_pgdat() will work across all this node's zones.
+ */
+static void scrub_pgdat(pg_data_t *pgdat)
+{
+	int i;
+	unsigned long pages_zeroed;
+
+	if (system_state != SYSTEM_RUNNING)
+		return;
+
+	do {
+		pages_zeroed = 0;
+		for (i = 0; i < pgdat->nr_zones; i++) {
+			struct zone *zone = pgdat->node_zones + i;
+
+			pages_zeroed += zero_highest_order_page(zone);
+		}
+	} while (pages_zeroed);
+}
+
+/*
+ * The background scrub daemon, started as a kernel thread
+ * from the init process.
+ */
+static int kscrubd(void *p)
+{
+	pg_data_t *pgdat = (pg_data_t*)p;
+	struct task_struct *tsk = current;
+	DEFINE_WAIT(wait);
+	cpumask_t cpumask;
+
+	daemonize("kscrubd%d", pgdat->node_id);
+	cpumask = node_to_cpumask(pgdat->node_id);
+	if (!cpus_empty(cpumask))
+		set_cpus_allowed(tsk, cpumask);
+
+	tsk->flags |= PF_MEMALLOC | PF_KSCRUBD;
+
+	for ( ; ; ) {
+		if (current->flags & PF_FREEZE)
+			refrigerator(PF_FREEZE);
+		prepare_to_wait(&pgdat->kscrubd_wait, &wait, TASK_INTERRUPTIBLE);
+		schedule();
+		finish_wait(&pgdat->kscrubd_wait, &wait);
+
+		scrub_pgdat(pgdat);
+	}
+	return 0;
+}
+
+static int __init kscrubd_init(void)
+{
+	pg_data_t *pgdat;
+	for_each_pgdat(pgdat)
+		pgdat->kscrubd
+		= find_task_by_pid(kernel_thread(kscrubd, pgdat, CLONE_KERNEL));
+	return 0;
+}
+
+module_init(kscrubd_init)
Index: linux-2.6.10/arch/ia64/sn/kernel/setup.c
===================================================================
--- linux-2.6.10.orig/arch/ia64/sn/kernel/setup.c	2005-02-03 22:49:33.000000000 -0800
+++ linux-2.6.10/arch/ia64/sn/kernel/setup.c	2005-02-07 11:17:33.000000000 -0800
@@ -251,6 +251,7 @@ void __init sn_setup(char **cmdline_p)
 	int pxm;
 	int major = sn_sal_rev_major(), minor = sn_sal_rev_minor();
 	extern void sn_cpu_init(void);
+	extern void sn_bte_bzero_init(void);
 
 	/*
 	 * If the generic code has enabled vga console support - lets
@@ -341,6 +342,7 @@ void __init sn_setup(char **cmdline_p)
 	screen_info = sn_screen_info;
 
 	sn_timer_init();
+	sn_bte_bzero_init();
 }
 
 /**
Index: linux-2.6.10/include/linux/scrub.h
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.10/include/linux/scrub.h	2005-02-07 11:17:27.000000000 -0800
@@ -0,0 +1,49 @@
+#ifndef _LINUX_SCRUB_H
+#define _LINUX_SCRUB_H
+
+/*
+ * Definitions for scrubbing of memory include an interface
+ * for drivers that may allow the zeroing of memory
+ * without invalidating the caches.
+ *
+ * Christoph Lameter, December 2004.
+ */ + +struct zero_driver { + int (*start)(void *, unsigned long); /* Start bzero transfer */ + struct list_head list; +}; + +extern struct list_head zero_drivers; + +extern unsigned int sysctl_scrub_start; +extern unsigned int sysctl_scrub_stop; +extern unsigned int sysctl_scrub_load; + +/* Registering and unregistering zero drivers */ +static inline void register_zero_driver(struct zero_driver *z) +{ + list_add(&z->list, &zero_drivers); +} + +static inline void unregister_zero_driver(struct zero_driver *z) +{ + list_del(&z->list); +} + +extern struct page *scrubd_rmpage(struct zone *zone, struct free_area *area); + +static void inline wakeup_kscrubd(struct zone *zone) +{ + if (avenrun[0] >= ((unsigned long)sysctl_scrub_load << FSHIFT)) + return; + if (!waitqueue_active(&zone->zone_pgdat->kscrubd_wait)) + return; + wake_up_interruptible(&zone->zone_pgdat->kscrubd_wait); +} + +int scrub_start_handler(struct ctl_table *, int, struct file *, + void __user *, size_t *, loff_t *); + +extern void end_zero_page(struct page *page, unsigned int order); +#endif Index: linux-2.6.10/kernel/sysctl.c =================================================================== --- linux-2.6.10.orig/kernel/sysctl.c 2005-02-03 22:51:56.000000000 -0800 +++ linux-2.6.10/kernel/sysctl.c 2005-02-07 11:17:27.000000000 -0800 @@ -40,6 +40,7 @@ #include #include #include +#include #include #include @@ -825,6 +826,33 @@ static ctl_table vm_table[] = { .strategy = &sysctl_jiffies, }, #endif + { + .ctl_name = VM_SCRUB_START, + .procname = "scrub_start", + .data = &sysctl_scrub_start, + .maxlen = sizeof(sysctl_scrub_start), + .mode = 0644, + .proc_handler = &scrub_start_handler, + .strategy = &sysctl_intvec, + }, + { + .ctl_name = VM_SCRUB_STOP, + .procname = "scrub_stop", + .data = &sysctl_scrub_stop, + .maxlen = sizeof(sysctl_scrub_stop), + .mode = 0644, + .proc_handler = &proc_dointvec, + .strategy = &sysctl_intvec, + }, + { + .ctl_name = VM_SCRUB_LOAD, + .procname = "scrub_load", + .data = &sysctl_scrub_load, + .maxlen = sizeof(sysctl_scrub_load), + .mode = 0644, + .proc_handler = &proc_dointvec, + .strategy = &sysctl_intvec, + }, { .ctl_name = 0 } }; Index: linux-2.6.10/arch/sparc64/lib/clear_page.S =================================================================== --- linux-2.6.10.orig/arch/sparc64/lib/clear_page.S 2004-12-24 13:35:23.000000000 -0800 +++ linux-2.6.10/arch/sparc64/lib/clear_page.S 2005-02-07 11:17:24.000000000 -0800 @@ -28,9 +28,12 @@ .text .globl _clear_page -_clear_page: /* %o0=dest */ +_clear_page: /* %o0=dest, %o1=order */ + sethi %hi(PAGE_SIZE/64), %o2 + clr %o4 + or %o2, %lo(PAGE_SIZE/64), %o2 ba,pt %xcc, clear_page_common - clr %o4 + sllx %o2, %o1, %o1 /* This thing is pretty important, it shows up * on the profiles via do_anonymous_page(). @@ -69,16 +72,16 @@ clear_user_page: /* %o0=dest, %o1=vaddr flush %g6 wrpr %o4, 0x0, %pstate + sethi %hi(PAGE_SIZE/64), %o1 mov 1, %o4 + or %o1, %lo(PAGE_SIZE/64), %o1 clear_page_common: VISEntryHalf membar #StoreLoad | #StoreStore | #LoadStore fzero %f0 - sethi %hi(PAGE_SIZE/64), %o1 mov %o0, %g1 ! 
remember vaddr for tlbflush fzero %f2 - or %o1, %lo(PAGE_SIZE/64), %o1 faddd %f0, %f2, %f4 fmuld %f0, %f2, %f6 faddd %f0, %f2, %f8 Index: linux-2.6.10/include/asm-sparc64/page.h =================================================================== --- linux-2.6.10.orig/include/asm-sparc64/page.h 2005-02-03 22:51:43.000000000 -0800 +++ linux-2.6.10/include/asm-sparc64/page.h 2005-02-07 11:17:24.000000000 -0800 @@ -14,8 +14,10 @@ #ifndef __ASSEMBLY__ -extern void _clear_page(void *page); -#define clear_page(X) _clear_page((void *)(X)) +extern void _clear_page(void *page, int order); +#define clear_page(X) _clear_page((void *)(X), 0) +#define clear_pages _clear_page + struct page; extern void clear_user_page(void *addr, unsigned long vaddr, struct page *page); #define copy_page(X,Y) memcpy((void *)(X), (void *)(Y), PAGE_SIZE) Index: linux-2.6.10/include/asm-i386/page.h =================================================================== --- linux-2.6.10.orig/include/asm-i386/page.h 2005-02-03 22:51:34.000000000 -0800 +++ linux-2.6.10/include/asm-i386/page.h 2005-02-07 11:17:24.000000000 -0800 @@ -18,7 +18,7 @@ #include -#define clear_page(page) mmx_clear_page((void *)(page)) +#define clear_pages(page, order) mmx_clear_page((void *)(page),order) #define copy_page(to,from) mmx_copy_page(to,from) #else @@ -28,11 +28,13 @@ * Maybe the K6-III ? */ -#define clear_page(page) memset((void *)(page), 0, PAGE_SIZE) +#define clear_pages(page, order) memset((void *)(page), 0, PAGE_SIZE << (order)) #define copy_page(to,from) memcpy((void *)(to), (void *)(from), PAGE_SIZE) #endif +#define __HAVE_ARCH_CLEAR_PAGES +#define clear_page(page) clear_pages(page, 0) #define clear_user_page(page, vaddr, pg) clear_page(page) #define copy_user_page(to, from, vaddr, pg) copy_page(to, from) Index: linux-2.6.10/arch/ia64/kernel/ia64_ksyms.c =================================================================== --- linux-2.6.10.orig/arch/ia64/kernel/ia64_ksyms.c 2005-02-03 22:49:31.000000000 -0800 +++ linux-2.6.10/arch/ia64/kernel/ia64_ksyms.c 2005-02-07 11:17:24.000000000 -0800 @@ -38,7 +38,7 @@ EXPORT_SYMBOL(__down_trylock); EXPORT_SYMBOL(__up); #include -EXPORT_SYMBOL(clear_page); +EXPORT_SYMBOL(clear_pages); #ifdef CONFIG_VIRTUAL_MEM_MAP #include Index: linux-2.6.10/include/asm-x86_64/page.h =================================================================== --- linux-2.6.10.orig/include/asm-x86_64/page.h 2005-02-03 22:51:46.000000000 -0800 +++ linux-2.6.10/include/asm-x86_64/page.h 2005-02-07 11:17:24.000000000 -0800 @@ -32,8 +32,10 @@ #ifdef __KERNEL__ #ifndef __ASSEMBLY__ -void clear_page(void *); +void clear_pages(void *, int); void copy_page(void *, void *); +#define __HAVE_ARCH_CLEAR_PAGES +#define clear_page(__page) clear_pages(__page, 0) #define clear_user_page(page, vaddr, pg) clear_page(page) #define copy_user_page(to, from, vaddr, pg) copy_page(to, from) Index: linux-2.6.10/arch/x86_64/lib/clear_page.S =================================================================== --- linux-2.6.10.orig/arch/x86_64/lib/clear_page.S 2004-12-24 13:34:33.000000000 -0800 +++ linux-2.6.10/arch/x86_64/lib/clear_page.S 2005-02-07 11:17:24.000000000 -0800 @@ -1,12 +1,16 @@ /* * Zero a page. 
* rdi page + * rsi order */ - .globl clear_page + .globl clear_pages .p2align 4 -clear_page: +clear_pages: + movl $4096/64,%eax + movl %esi, %ecx + shll %cl, %eax + movl %eax, %ecx xorl %eax,%eax - movl $4096/64,%ecx .p2align 4 .Lloop: decl %ecx @@ -23,7 +27,7 @@ clear_page: jnz .Lloop nop ret -clear_page_end: +clear_pages_end: /* C stepping K8 run faster using the string instructions. It is also a lot simpler. Use this when possible */ @@ -32,19 +36,22 @@ clear_page_end: .section .altinstructions,"a" .align 8 - .quad clear_page - .quad clear_page_c + .quad clear_pages + .quad clear_pages_c .byte X86_FEATURE_K8_C - .byte clear_page_end-clear_page - .byte clear_page_c_end-clear_page_c + .byte clear_pages_end-clear_pages + .byte clear_pages_c_end-clear_pages_c .previous .section .altinstr_replacement,"ax" -clear_page_c: - movl $4096/8,%ecx +clear_pages_c: + movl $4096/8,%eax + movl %esi, %ecx + shll %cl, %eax + movl %eax, %ecx xorl %eax,%eax rep stosq ret -clear_page_c_end: +clear_pages_c_end: .previous Index: linux-2.6.10/arch/x86_64/kernel/x8664_ksyms.c =================================================================== --- linux-2.6.10.orig/arch/x86_64/kernel/x8664_ksyms.c 2005-02-03 22:50:03.000000000 -0800 +++ linux-2.6.10/arch/x86_64/kernel/x8664_ksyms.c 2005-02-07 11:17:24.000000000 -0800 @@ -108,7 +108,7 @@ EXPORT_SYMBOL(pci_mem_start); #endif EXPORT_SYMBOL(copy_page); -EXPORT_SYMBOL(clear_page); +EXPORT_SYMBOL(clear_pages); EXPORT_SYMBOL(cpu_pda); #ifdef CONFIG_SMP Index: linux-2.6.10/include/linux/sysctl.h =================================================================== --- linux-2.6.10.orig/include/linux/sysctl.h 2005-02-07 10:59:20.000000000 -0800 +++ linux-2.6.10/include/linux/sysctl.h 2005-02-07 11:17:27.000000000 -0800 @@ -169,6 +169,9 @@ enum VM_VFS_CACHE_PRESSURE=26, /* dcache/icache reclaim pressure */ VM_LEGACY_VA_LAYOUT=27, /* legacy/compatibility virtual address space layout */ VM_SWAP_TOKEN_TIMEOUT=28, /* default time for token time out */ + VM_SCRUB_START=30, /* percentage * 10 at which to start scrubd */ + VM_SCRUB_STOP=31, /* percentage * 10 at which to stop scrubd */ + VM_SCRUB_LOAD=32, /* Load factor at which not to scrub anymore */ }; Index: linux-2.6.10/arch/ia64/sn/kernel/bte.c =================================================================== --- linux-2.6.10.orig/arch/ia64/sn/kernel/bte.c 2005-02-07 10:59:18.000000000 -0800 +++ linux-2.6.10/arch/ia64/sn/kernel/bte.c 2005-02-07 11:17:33.000000000 -0800 @@ -3,7 +3,9 @@ * License. See the file "COPYING" in the main directory of this archive * for more details. * - * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved. + * Copyright (c) 2000-2005 Silicon Graphics, Inc. All Rights Reserved. + * + * Support for zeroing pages, Christoph Lameter, SGI, December 2004. 
*/ #include @@ -20,6 +22,8 @@ #include #include #include +#include +#include #include @@ -30,7 +34,7 @@ /* two interfaces on two btes */ #define MAX_INTERFACES_TO_TRY 4 -static struct bteinfo_s *bte_if_on_node(nasid_t nasid, int interface) +static inline struct bteinfo_s *bte_if_on_node(nasid_t nasid, int interface) { nodepda_t *tmp_nodepda; @@ -199,6 +203,7 @@ retry_bteop: } while ((transfer_stat = *bte->most_rcnt_na) == BTE_WORD_BUSY) { + cpu_relax(); if (ia64_get_itc() > itc_end) { BTE_PRINTK(("BTE timeout nasid 0x%x bte%d IBLS = 0x%lx na 0x%lx\n", NASID_GET(bte->bte_base_addr), bte->bte_num, @@ -449,5 +454,25 @@ void bte_init_node(nodepda_t * mynodepda mynodepda->bte_if[i].cleanup_active = 0; mynodepda->bte_if[i].bh_error = 0; } +} + +static int bte_start_bzero(void *p, unsigned long len) +{ + + /* Check limitations. + 1. System must be running (weird things happen during bootup) + 2. Size >64KB. Smaller requests cause too much bte traffic + */ + if (len >= BTE_MAX_XFER || len < 60000 || system_state != SYSTEM_RUNNING) + return EINVAL; + + return bte_zero(ia64_tpa(p), len, 0, NULL); +} + +static struct zero_driver bte_bzero = { + .start = bte_start_bzero, +}; +void sn_bte_bzero_init(void) { + register_zero_driver(&bte_bzero); } Index: linux-2.6.10/include/linux/mmzone.h =================================================================== --- linux-2.6.10.orig/include/linux/mmzone.h 2005-02-03 22:51:48.000000000 -0800 +++ linux-2.6.10/include/linux/mmzone.h 2005-02-07 11:17:27.000000000 -0800 @@ -51,7 +51,7 @@ struct per_cpu_pages { }; struct per_cpu_pageset { - struct per_cpu_pages pcp[2]; /* 0: hot. 1: cold */ + struct per_cpu_pages pcp[3]; /* 0: hot. 1: cold 2: cold zeroed pages */ #ifdef CONFIG_NUMA unsigned long numa_hit; /* allocated in intended node */ unsigned long numa_miss; /* allocated in non intended node */ @@ -107,10 +107,14 @@ struct per_cpu_pageset { * ZONE_HIGHMEM > 896 MB only page cache and user processes */ +#define NOT_ZEROED 0 +#define ZEROED 1 + struct zone { /* Fields commonly accessed by the page allocator */ unsigned long free_pages; unsigned long pages_min, pages_low, pages_high; + unsigned long zero_pages; /* * We don't know if the memory that we're going to allocate will be freeable * or/and it will be released eventually, so to avoid totally wasting several @@ -127,7 +131,7 @@ struct zone { * free areas of different sizes */ spinlock_t lock; - struct free_area free_area[MAX_ORDER]; + struct free_area free_area[2][MAX_ORDER]; ZONE_PADDING(_pad1_) @@ -262,6 +266,9 @@ typedef struct pglist_data { wait_queue_head_t kswapd_wait; struct task_struct *kswapd; int kswapd_max_order; + + wait_queue_head_t kscrubd_wait; + struct task_struct *kscrubd; } pg_data_t; #define node_present_pages(nid) (NODE_DATA(nid)->node_present_pages) @@ -270,9 +277,9 @@ typedef struct pglist_data { extern struct pglist_data *pgdat_list; void __get_zone_counts(unsigned long *active, unsigned long *inactive, - unsigned long *free, struct pglist_data *pgdat); + unsigned long *free, unsigned long *zero, struct pglist_data *pgdat); void get_zone_counts(unsigned long *active, unsigned long *inactive, - unsigned long *free); + unsigned long *free, unsigned long *zero); void build_all_zonelists(void); void wakeup_kswapd(struct zone *zone, int order); int zone_watermark_ok(struct zone *z, int order, unsigned long mark, Index: linux-2.6.10/mm/readahead.c =================================================================== --- linux-2.6.10.orig/mm/readahead.c 2005-02-03 22:51:57.000000000 -0800 +++ 
linux-2.6.10/mm/readahead.c 2005-02-07 11:17:27.000000000 -0800 @@ -573,7 +573,8 @@ unsigned long max_sane_readahead(unsigne unsigned long active; unsigned long inactive; unsigned long free; + unsigned long zero; - __get_zone_counts(&active, &inactive, &free, NODE_DATA(numa_node_id())); + __get_zone_counts(&active, &inactive, &free, &zero, NODE_DATA(numa_node_id())); return min(nr, (inactive + free) / 2); }
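
A zeroing engine plugs into kscrubd through the zero_driver interface declared in include/linux/scrub.h above; the SN2 BTE hook in arch/ia64/sn/kernel/bte.c is the only user added by this patch. As a rough sketch of what another driver could look like (example_zero_start, example_zero, example_scrub_init are made-up names and the memset() merely stands in for a real offload engine; only register_zero_driver() and the ->start() contract come from the patch):

#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/scrub.h>

/*
 * Called by kscrubd with the kernel virtual address and length (in bytes)
 * of a free page run taken off the NOT_ZEROED free lists.  Returning 0
 * means the range has been zeroed; any nonzero value makes kscrubd fall
 * back to clearing the pages on the CPU via prep_zero_page().  Either way
 * kscrubd moves the pages to the ZEROED lists with end_zero_page().
 */
static int example_zero_start(void *addr, unsigned long len)
{
	if (system_state != SYSTEM_RUNNING || len < 64 * 1024)
		return -EINVAL;	/* small requests are not worth offloading */

	/* A real driver would kick off cache-bypassing DMA here. */
	memset(addr, 0, len);
	return 0;
}

static struct zero_driver example_zero = {
	.start	= example_zero_start,
};

static int __init example_scrub_init(void)
{
	register_zero_driver(&example_zero);
	return 0;
}
module_init(example_scrub_init);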
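
A note on the new tunables, using the defaults from mm/scrubd.c: freeing that coalesces a buddy of order >= scrub_start (6, i.e. 256 kB with 4 kB pages, 1 MB with 16 kB ia64 pages) wakes kscrubd; kscrubd then repeatedly zeroes the largest unzeroed runs and stops once only runs below order scrub_stop (4, i.e. 64 kB / 256 kB) remain; and wakeup_kscrubd() refuses to wake it while the one-minute load average is at or above scrub_load (999 by default, so effectively never throttled). Allocator callers need no changes: __GFP_ZERO keeps its meaning, and buffered_rmqueue() simply satisfies it from the ZEROED free lists or the third per-cpu list when enough zeroed pages are present, falling back to prep_zero_page() otherwise.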