File: /usr/src/linux/arch/ia64/sn/sn1/mm.c

1     /*
2      * Copyright, 2000-2001, Silicon Graphics.
3      * Copyright Srinivasa Thirumalachar (sprasad@engr.sgi.com)
4      * Copyright 2000-2001 Kanoj Sarcar (kanoj@sgi.com)
5      */
6     
7     #include <linux/config.h>
8     #include <linux/mm.h>
9     #include <linux/bootmem.h>
10     #include <asm/page.h>
11     #include <asm/efi.h>
12     #include <asm/sn/mmzone_sn1.h>
13     
/*
 * Two-argument minimum / maximum.  Both are plain macros, so either
 * argument may be evaluated twice: do not pass expressions that have
 * side effects.
 */
#define MIN(a,b)	(((a) < (b)) ? (a) : (b))
#define MAX(a,b)	(((a) > (b)) ? (a) : (b))

/*
 * Values kept in nodemem[].done while the bootmem area of a node is
 * being set up.
 */
#define DONE_NOTHING	0	/* no bootmem area chosen for the node yet */
#define DONE_FINDING	1	/* find_node_bootmem() has chosen bstart */
#define DONE_BUILDING	2	/* build_node_bootmem() did the node wide setup */
20     
21     struct nodemem_s {
22             u64     start;	/* start of kernel usable memory */
23             u64     end;	/* end of kernel usable memory */
24     	u64	mtot;	/* total kernel usable memory */
25     	u64	done;	/* state of bootmem initialization */
26     	u64	bstart;	/* where should the bootmem area be */
27     	u64	bsize;	/* bootmap size */
28             u64 	hole[SN1_MAX_BANK_PER_NODE];
29     } nodemem[MAXNODES];
30     
31     static int nodemem_valid = 0;
32     
33     static int __init
34     free_unused_memmap_hole(int nid, unsigned long start, unsigned long end)
35     {
36             struct page * page, *pageend;
37             unsigned long count = 0;
38     
39     	if (start >= end)
40     		return 0;
41     
42     	/*
43     	 * Get the memmap ptrs to the start and end of the holes.
44     	 * virt_to_page(start) will panic, if start is in hole.
45     	 * Can we do virt_to_page(end), if end is on the next node?
46     	 */
47     
48     	page = virt_to_page(start - 1);
49     	page++;
50     	pageend = virt_to_page(end);
51     
52     	printk("hpage=0x%lx, hpageend=0x%lx\n", (u64)page, (u64)pageend) ;
53     	free_bootmem_node(NODE_DATA(nid), __pa(page), (u64)pageend - (u64)page);
54     
55     	return count;
56     }
57     
58     static void __init
59     free_unused_memmap_node(int nid)
60     {
61     	u64	i = 0;
62     	u64	holestart = -1;
63     	u64	start = nodemem[nid].start;
64     
65     	start = ((start >> SN1_NODE_ADDR_SHIFT) << SN1_NODE_ADDR_SHIFT);
66     	do {
67     		holestart = nodemem[nid].hole[i];
68     		i++;
69     		while ((i < SN1_MAX_BANK_PER_NODE) && 
70     					(nodemem[nid].hole[i] == (u64)-1))
71     			i++;
72     		if (i < SN1_MAX_BANK_PER_NODE)
73     			free_unused_memmap_hole(nid, holestart, 
74     				start + (i<<SN1_BANK_ADDR_SHIFT));
75     	} while (i<SN1_MAX_BANK_PER_NODE);
76     }
77     
78     /*
79      * Since efi_memmap_walk merges contiguous banks, this code will need
80      * to find all the nasid/banks covered by the input memory descriptor.
81      */
82     static int __init
83     build_nodemem_map(unsigned long start, unsigned long end, void *arg)
84     {
85     	unsigned long vaddr = start;
86     	unsigned long nvaddr;
87     	int nasid = GetNasId(__pa(vaddr));
88     	int cnodeid, bankid;
89     
90     	while (vaddr < end) {
91     		cnodeid = NASID_TO_CNODEID(nasid);
92     		bankid = GetBankId(__pa(vaddr));
93     		nodemem[cnodeid].start = MIN(nodemem[cnodeid].start, vaddr);
94     		nvaddr = (unsigned long)__va((unsigned long)(++nasid) << 
95     							SN1_NODE_ADDR_SHIFT);
96     		nodemem[cnodeid].end = MAX(nodemem[cnodeid].end, MIN(end, nvaddr));
97     		while ((bankid < SN1_MAX_BANK_PER_NODE) && 
98     					(vaddr < nodemem[cnodeid].end)) {
99     			nvaddr = nodemem[cnodeid].start + 
100     			  ((unsigned long)(bankid + 1) << SN1_BANK_ADDR_SHIFT);
101     			nodemem[cnodeid].hole[bankid++] = MIN(nvaddr, end);
102     			vaddr = nvaddr;
103     		}
104     	}
105     
106     	return 0;
107     }
108     
109     static int __init
110     pgtbl_size_ok(int nid)
111     {
112     	unsigned long numpfn, bank0size, nodesize ;
113     	unsigned long start = nodemem[nid].start;
114     
115     	start = ((start >> SN1_NODE_ADDR_SHIFT) << SN1_NODE_ADDR_SHIFT);
116     	
117     	nodesize 	= nodemem[nid].end - start ;
118     	numpfn 		= nodesize >> PAGE_SHIFT;
119     
120     	bank0size 	= nodemem[nid].hole[0] - start ;
121     	/* If nid == master node && no kernel text replication */
122     	bank0size      -= 0xA00000 ;	/* Kernel text + stuff */
123     	bank0size      -= ((numpfn + 7) >> 3);
124     
125     	if ((numpfn * sizeof(mem_map_t)) > bank0size) {
126     		printk("nid = %d, ns=0x%lx, npfn=0x%lx, bank0size=0x%lx\n", 
127     			nid, nodesize, numpfn, bank0size) ;
128     		return 0 ;
129     	}
130     
131     	return 1 ;
132     }
133     
134     static void __init
135     check_pgtbl_size(int nid)
136     {
137     	int	bank = SN1_MAX_BANK_PER_NODE - 1 ;
138     
139     	/* Find highest bank with valid memory */
140             while ((nodemem[nid].hole[bank] == -1) && (bank))
141                    bank-- ;
142     
143     	while (!pgtbl_size_ok(nid)) {
144     		/* Remove that bank of memory */
145     		/* Collect some numbers later */
146     		printk("Ignoring node %d bank %d\n", nid, bank) ;
147     		nodemem[nid].hole[bank--] = -1 ;
148     		/* Get to the next populated bank */
149     		while ((nodemem[nid].hole[bank] == -1) && (bank))
150     			bank-- ;
151     		printk("Using only upto bank %d on node %d\n", bank,nid) ;
152     		nodemem[nid].end = nodemem[nid].hole[bank] ; 
153     		if (!bank) break ;
154     	}
155     }
156     
/* Defined at the end of this file; prints nodemem[] for the first maxnodes nodes. */
void dump_nodemem_map(int maxnodes);
158     
159     #ifdef CONFIG_DISCONTIGMEM
160     
161     extern bootmem_data_t bdata[];
162     
163     /*
164      * This assumes there will be a hole in kernel-usable memory between nodes
165      * (due to prom). The memory descriptors invoked via efi_memmap_walk are 
166      * in increasing order. It tries to identify first suitable free area to 
167      * put the bootmem for the node in. When presented with the md holding
168      * the kernel, it only searches at the end of the kernel area.
169      */
170     static int __init
171     find_node_bootmem(unsigned long start, unsigned long end, void *arg)
172     {
173     	int nasid = GetNasId(__pa(start));
174     	int cnodeid = NASID_TO_CNODEID(nasid);
175     	unsigned long nodesize;
176     	extern char _end;
177     	unsigned long kaddr = (unsigned long)&_end;
178     
179     	/*
180     	 * Track memory available to kernel.
181     	 */
182     	nodemem[cnodeid].mtot += ((end - start) >> PAGE_SHIFT);
183     	if (nodemem[cnodeid].done != DONE_NOTHING)
184     		return(0);
185     	nodesize = nodemem[cnodeid].end - ((nodemem[cnodeid].start >> 
186     				SN1_NODE_ADDR_SHIFT) << SN1_NODE_ADDR_SHIFT);
187     	nodesize >>= PAGE_SHIFT;
188     
189     	/*
190     	 * Adjust limits for the md holding the kernel.
191     	 */
192     	if ((start < kaddr) && (end > kaddr))
193     		start = PAGE_ALIGN(kaddr);
194     
195     	/*
196     	 * We need space for mem_map, bootmem map plus a few more pages
197     	 * to satisfy alloc_bootmems out of node 0.
198     	 */
199     	if ((end - start) > ((nodesize * sizeof(struct page)) + (nodesize/8)
200     						+ (10 * PAGE_SIZE))) {
201     		nodemem[cnodeid].bstart = start;
202     		nodemem[cnodeid].done = DONE_FINDING;
203     	}
204     	return(0);
205     }
206     
207     /*
208      * This assumes there will be a hole in kernel-usable memory between nodes
209      * (due to prom). The memory descriptors invoked via efi_memmap_walk are 
210      * in increasing order.
211      */
212     static int __init
213     build_node_bootmem(unsigned long start, unsigned long end, void *arg)
214     {
215     	int nasid = GetNasId(__pa(start));
216     	int curnodeid = NASID_TO_CNODEID(nasid);
217     	int i;
218     	unsigned long pstart, pend;
219     	extern char _end, _stext;
220     	unsigned long kaddr = (unsigned long)&_end;
221     
222     	if (nodemem[curnodeid].done == DONE_FINDING) {
223     		/*
224     		 * This is where we come to know the node is present.
225     		 * Do node wide tasks.
226     		 */
227     		nodemem[curnodeid].done = DONE_BUILDING;
228     		NODE_DATA(curnodeid)->bdata = &(bdata[curnodeid]);
229     
230     		/*
231     	 	 * Update the chunktonid array as a node wide task. There
232     		 * are too many smalls mds on first node to do this per md.
233     	 	 */
234     		pstart = __pa(nodemem[curnodeid].start);
235     		pend = __pa(nodemem[curnodeid].end);
236     		pstart &= CHUNKMASK;
237     		pend = (pend + CHUNKSZ - 1) & CHUNKMASK;
238     		/* Possible check point to enforce minimum node size */
239     		if (nodemem[curnodeid].bstart == -1) {
240     			printk("No valid bootmem area on node %d\n", curnodeid);
241     			while(1);
242     		}
243     		for (i = PCHUNKNUM(pstart); i <= PCHUNKNUM(pend - 1); i++)
244     			chunktonid[i] = curnodeid;
245     		if ((CHUNKTONID(PCHUNKNUM(pend)) > MAXCHUNKS) || 
246     				(PCHUNKNUM(pstart) >= PCHUNKNUM(pend))) {
247     			printk("Ign 0x%lx-0x%lx, ", __pa(start), __pa(end));
248     			return(0);
249     		}
250     
251     		/*
252     		 * NODE_START and NODE_SIZE determine the physical range
253     		 * on the node that mem_map array needs to be set up for.
254     		 */
255     		NODE_START(curnodeid) = ((nodemem[curnodeid].start >> 
256     				SN1_NODE_ADDR_SHIFT) << SN1_NODE_ADDR_SHIFT);
257     		NODE_SIZE(curnodeid) = (nodemem[curnodeid].end - 
258     							NODE_START(curnodeid));
259     
260             	nodemem[curnodeid].bsize = 
261     			init_bootmem_node(NODE_DATA(curnodeid),
262     			(__pa(nodemem[curnodeid].bstart) >> PAGE_SHIFT),
263     			(__pa((nodemem[curnodeid].start >> SN1_NODE_ADDR_SHIFT)
264     			<< SN1_NODE_ADDR_SHIFT) >> PAGE_SHIFT),
265     			(__pa(nodemem[curnodeid].end) >> PAGE_SHIFT));
266     
267     	} else if (nodemem[curnodeid].done == DONE_NOTHING) {
268     		printk("build_node_bootmem: node %d weirdness\n", curnodeid);
269     		while(1);		/* Paranoia */
270     	}
271     
272     	/*
273     	 * Free the entire md.
274     	 */
275     	free_bootmem_node(NODE_DATA(curnodeid), __pa(start), (end - start));
276     
277     	/*
278     	 * Reclaim back the bootmap and kernel areas.
279     	 */
280     	if ((start <= nodemem[curnodeid].bstart) && (end >
281     						nodemem[curnodeid].bstart))
282     		reserve_bootmem_node(NODE_DATA(curnodeid),
283     		    __pa(nodemem[curnodeid].bstart), nodemem[curnodeid].bsize);
284     	if ((start <= kaddr) && (end > kaddr))
285     		reserve_bootmem_node(NODE_DATA(curnodeid),
286     		    __pa(&_stext), (&_end - &_stext));
287     
288     	return(0);
289     }
290     
291     void __init
292     setup_sn1_bootmem(int maxnodes)
293     {
294             int     i;
295     
296             for (i = 0; i < MAXNODES; i++) {
297                     nodemem[i].start = nodemem[i].bstart = -1;
298                     nodemem[i].end = nodemem[i].bsize = nodemem[i].mtot = 0;
299     		nodemem[i].done = DONE_NOTHING;
300     		memset(&nodemem[i].hole, -1, sizeof(nodemem[i].hole));
301             }
302             efi_memmap_walk(build_nodemem_map, 0);
303     
304     	nodemem_valid = 1;
305     
306     	/* 
307     	 * After building the nodemem map, check if the node memmap
308     	 * will fit in the first bank of each node. If not change
309     	 * the node end addr till it fits.
310      	 */
311     
312             for (i = 0; i < maxnodes; i++)
313     		check_pgtbl_size(i);
314     
315     	dump_nodemem_map(maxnodes);
316     
317     	efi_memmap_walk(find_node_bootmem, 0);
318     	efi_memmap_walk(build_node_bootmem, 0);
319     }
320     #endif
321     
322     void __init
323     discontig_paging_init(void)
324     {
325     	int i;
326     	unsigned long max_dma, zones_size[MAX_NR_ZONES], holes_size[MAX_NR_ZONES];
327     	extern void dump_node_data(void);
328     
329     	max_dma = virt_to_phys((void *) MAX_DMA_ADDRESS) >> PAGE_SHIFT;
330     	for (i = 0; i < numnodes; i++) {
331     		unsigned long startpfn = __pa((void *)NODE_START(i)) >> PAGE_SHIFT;
332     		unsigned long numpfn = NODE_SIZE(i) >> PAGE_SHIFT;
333     		memset(zones_size, 0, sizeof(zones_size));
334     		memset(holes_size, 0, sizeof(holes_size));
335     		holes_size[ZONE_DMA] = numpfn - nodemem[i].mtot;
336     
337     		if ((startpfn + numpfn) < max_dma) {
338     			zones_size[ZONE_DMA] = numpfn;
339     		} else if (startpfn > max_dma) {
340     			zones_size[ZONE_NORMAL] = numpfn;
341     			panic("discontig_paging_init: %d\n", i);
342     		} else {
343     			zones_size[ZONE_DMA] = (max_dma - startpfn);
344     			zones_size[ZONE_NORMAL] = numpfn - zones_size[ZONE_DMA];
345     			panic("discontig_paging_init: %d\n", i);
346     		}
347     		free_area_init_node(i, NODE_DATA(i), NULL, zones_size, startpfn<<PAGE_SHIFT, holes_size);
348     		free_unused_memmap_node(i);
349     	}
350     	dump_node_data();
351     }
352     
353     /*
354      * This used to be invoked from an SN1 specific hack in efi_memmap_walk.
355      * It tries to ignore banks which the kernel is ignoring because bank 0 
356      * is too small to hold the memmap entries for this bank.
357      * The current SN1 efi_memmap_walk callbacks do not need this. That 
358      * leaves the generic ia64 callbacks find_max_pfn, count_pages and
359      * count_reserved_pages, of which the first can probably get by without
360      * this, the last two probably need this, although they also can probably
361      * get by. 
362      */
363     int
364     sn1_bank_ignore(u64 start, u64 end)
365     {
366     	int 	nid = NASID_TO_CNODEID(GetNasId(__pa(end))) ;
367     	int	bank = GetBankId(__pa(end)) ;
368     
369     	if (!nodemem_valid)
370     		return 0 ;
371     
372     	if (nodemem[nid].hole[bank] == -1)
373     		return 1 ;
374     	else
375     		return 0 ;
376     }
377     
378     void
379     dump_nodemem_map(int maxnodes)
380     {
381     	int	i,j;
382     
383             printk("NODEMEM_S info ....\n") ;
384             printk("Node         start                end\n");
385             for (i=0;i<maxnodes;i++) {
386                     printk("%d      0x%lx   0x%lx\n",
387                            i, nodemem[i].start, nodemem[i].end);
388                     printk("Holes -> ") ;
389                     for (j=0;j<SN1_MAX_BANK_PER_NODE;j++)
390                             printk("0x%lx ", nodemem[i].hole[j]) ;
391     		printk("\n");
392             }
393     }
394     
395