File: /usr/src/linux/arch/ia64/sn/sn1/mm.c
1 /*
2 * Copyright, 2000-2001, Silicon Graphics.
3 * Copyright Srinivasa Thirumalachar (sprasad@engr.sgi.com)
4 * Copyright 2000-2001 Kanoj Sarcar (kanoj@sgi.com)
5 */
6
7 #include <linux/config.h>
8 #include <linux/mm.h>
9 #include <linux/bootmem.h>
10 #include <asm/page.h>
11 #include <asm/efi.h>
12 #include <asm/sn/mmzone_sn1.h>
13
/*
 * Two-operand minimum/maximum.  These are plain macros, so an argument
 * may be evaluated more than once; do not pass expressions with side
 * effects.
 */
#define MIN(a,b)	(((a) < (b)) ? (a) : (b))
#define MAX(a,b)	(((a) > (b)) ? (a) : (b))

/* Values kept in nodemem_s.done while the per node bootmem is set up. */
#define DONE_NOTHING	0	/* initial value from setup_sn1_bootmem */
#define DONE_FINDING	1	/* find_node_bootmem chose a bootmem area */
#define DONE_BUILDING	2	/* build_node_bootmem did the node wide work */
20
21 struct nodemem_s {
22 u64 start; /* start of kernel usable memory */
23 u64 end; /* end of kernel usable memory */
24 u64 mtot; /* total kernel usable memory */
25 u64 done; /* state of bootmem initialization */
26 u64 bstart; /* where should the bootmem area be */
27 u64 bsize; /* bootmap size */
28 u64 hole[SN1_MAX_BANK_PER_NODE];
29 } nodemem[MAXNODES];
30
31 static int nodemem_valid = 0;
32
33 static int __init
34 free_unused_memmap_hole(int nid, unsigned long start, unsigned long end)
35 {
36 struct page * page, *pageend;
37 unsigned long count = 0;
38
39 if (start >= end)
40 return 0;
41
42 /*
43 * Get the memmap ptrs to the start and end of the holes.
44 * virt_to_page(start) will panic, if start is in hole.
45 * Can we do virt_to_page(end), if end is on the next node?
46 */
47
48 page = virt_to_page(start - 1);
49 page++;
50 pageend = virt_to_page(end);
51
52 printk("hpage=0x%lx, hpageend=0x%lx\n", (u64)page, (u64)pageend) ;
53 free_bootmem_node(NODE_DATA(nid), __pa(page), (u64)pageend - (u64)page);
54
55 return count;
56 }
57
58 static void __init
59 free_unused_memmap_node(int nid)
60 {
61 u64 i = 0;
62 u64 holestart = -1;
63 u64 start = nodemem[nid].start;
64
65 start = ((start >> SN1_NODE_ADDR_SHIFT) << SN1_NODE_ADDR_SHIFT);
66 do {
67 holestart = nodemem[nid].hole[i];
68 i++;
69 while ((i < SN1_MAX_BANK_PER_NODE) &&
70 (nodemem[nid].hole[i] == (u64)-1))
71 i++;
72 if (i < SN1_MAX_BANK_PER_NODE)
73 free_unused_memmap_hole(nid, holestart,
74 start + (i<<SN1_BANK_ADDR_SHIFT));
75 } while (i<SN1_MAX_BANK_PER_NODE);
76 }
77
78 /*
79 * Since efi_memmap_walk merges contiguous banks, this code will need
80 * to find all the nasid/banks covered by the input memory descriptor.
81 */
82 static int __init
83 build_nodemem_map(unsigned long start, unsigned long end, void *arg)
84 {
85 unsigned long vaddr = start;
86 unsigned long nvaddr;
87 int nasid = GetNasId(__pa(vaddr));
88 int cnodeid, bankid;
89
90 while (vaddr < end) {
91 cnodeid = NASID_TO_CNODEID(nasid);
92 bankid = GetBankId(__pa(vaddr));
93 nodemem[cnodeid].start = MIN(nodemem[cnodeid].start, vaddr);
94 nvaddr = (unsigned long)__va((unsigned long)(++nasid) <<
95 SN1_NODE_ADDR_SHIFT);
96 nodemem[cnodeid].end = MAX(nodemem[cnodeid].end, MIN(end, nvaddr));
97 while ((bankid < SN1_MAX_BANK_PER_NODE) &&
98 (vaddr < nodemem[cnodeid].end)) {
99 nvaddr = nodemem[cnodeid].start +
100 ((unsigned long)(bankid + 1) << SN1_BANK_ADDR_SHIFT);
101 nodemem[cnodeid].hole[bankid++] = MIN(nvaddr, end);
102 vaddr = nvaddr;
103 }
104 }
105
106 return 0;
107 }
108
109 static int __init
110 pgtbl_size_ok(int nid)
111 {
112 unsigned long numpfn, bank0size, nodesize ;
113 unsigned long start = nodemem[nid].start;
114
115 start = ((start >> SN1_NODE_ADDR_SHIFT) << SN1_NODE_ADDR_SHIFT);
116
117 nodesize = nodemem[nid].end - start ;
118 numpfn = nodesize >> PAGE_SHIFT;
119
120 bank0size = nodemem[nid].hole[0] - start ;
121 /* If nid == master node && no kernel text replication */
122 bank0size -= 0xA00000 ; /* Kernel text + stuff */
123 bank0size -= ((numpfn + 7) >> 3);
124
125 if ((numpfn * sizeof(mem_map_t)) > bank0size) {
126 printk("nid = %d, ns=0x%lx, npfn=0x%lx, bank0size=0x%lx\n",
127 nid, nodesize, numpfn, bank0size) ;
128 return 0 ;
129 }
130
131 return 1 ;
132 }
133
134 static void __init
135 check_pgtbl_size(int nid)
136 {
137 int bank = SN1_MAX_BANK_PER_NODE - 1 ;
138
139 /* Find highest bank with valid memory */
140 while ((nodemem[nid].hole[bank] == -1) && (bank))
141 bank-- ;
142
143 while (!pgtbl_size_ok(nid)) {
144 /* Remove that bank of memory */
145 /* Collect some numbers later */
146 printk("Ignoring node %d bank %d\n", nid, bank) ;
147 nodemem[nid].hole[bank--] = -1 ;
148 /* Get to the next populated bank */
149 while ((nodemem[nid].hole[bank] == -1) && (bank))
150 bank-- ;
151 printk("Using only upto bank %d on node %d\n", bank,nid) ;
152 nodemem[nid].end = nodemem[nid].hole[bank] ;
153 if (!bank) break ;
154 }
155 }
156
157 void dump_nodemem_map(int) ;
158
159 #ifdef CONFIG_DISCONTIGMEM
160
161 extern bootmem_data_t bdata[];
162
163 /*
164 * This assumes there will be a hole in kernel-usable memory between nodes
165 * (due to prom). The memory descriptors invoked via efi_memmap_walk are
166 * in increasing order. It tries to identify first suitable free area to
167 * put the bootmem for the node in. When presented with the md holding
168 * the kernel, it only searches at the end of the kernel area.
169 */
170 static int __init
171 find_node_bootmem(unsigned long start, unsigned long end, void *arg)
172 {
173 int nasid = GetNasId(__pa(start));
174 int cnodeid = NASID_TO_CNODEID(nasid);
175 unsigned long nodesize;
176 extern char _end;
177 unsigned long kaddr = (unsigned long)&_end;
178
179 /*
180 * Track memory available to kernel.
181 */
182 nodemem[cnodeid].mtot += ((end - start) >> PAGE_SHIFT);
183 if (nodemem[cnodeid].done != DONE_NOTHING)
184 return(0);
185 nodesize = nodemem[cnodeid].end - ((nodemem[cnodeid].start >>
186 SN1_NODE_ADDR_SHIFT) << SN1_NODE_ADDR_SHIFT);
187 nodesize >>= PAGE_SHIFT;
188
189 /*
190 * Adjust limits for the md holding the kernel.
191 */
192 if ((start < kaddr) && (end > kaddr))
193 start = PAGE_ALIGN(kaddr);
194
195 /*
196 * We need space for mem_map, bootmem map plus a few more pages
197 * to satisfy alloc_bootmems out of node 0.
198 */
199 if ((end - start) > ((nodesize * sizeof(struct page)) + (nodesize/8)
200 + (10 * PAGE_SIZE))) {
201 nodemem[cnodeid].bstart = start;
202 nodemem[cnodeid].done = DONE_FINDING;
203 }
204 return(0);
205 }
206
207 /*
208 * This assumes there will be a hole in kernel-usable memory between nodes
209 * (due to prom). The memory descriptors invoked via efi_memmap_walk are
210 * in increasing order.
211 */
212 static int __init
213 build_node_bootmem(unsigned long start, unsigned long end, void *arg)
214 {
215 int nasid = GetNasId(__pa(start));
216 int curnodeid = NASID_TO_CNODEID(nasid);
217 int i;
218 unsigned long pstart, pend;
219 extern char _end, _stext;
220 unsigned long kaddr = (unsigned long)&_end;
221
222 if (nodemem[curnodeid].done == DONE_FINDING) {
223 /*
224 * This is where we come to know the node is present.
225 * Do node wide tasks.
226 */
227 nodemem[curnodeid].done = DONE_BUILDING;
228 NODE_DATA(curnodeid)->bdata = &(bdata[curnodeid]);
229
230 /*
231 * Update the chunktonid array as a node wide task. There
232 * are too many smalls mds on first node to do this per md.
233 */
234 pstart = __pa(nodemem[curnodeid].start);
235 pend = __pa(nodemem[curnodeid].end);
236 pstart &= CHUNKMASK;
237 pend = (pend + CHUNKSZ - 1) & CHUNKMASK;
238 /* Possible check point to enforce minimum node size */
239 if (nodemem[curnodeid].bstart == -1) {
240 printk("No valid bootmem area on node %d\n", curnodeid);
241 while(1);
242 }
243 for (i = PCHUNKNUM(pstart); i <= PCHUNKNUM(pend - 1); i++)
244 chunktonid[i] = curnodeid;
245 if ((CHUNKTONID(PCHUNKNUM(pend)) > MAXCHUNKS) ||
246 (PCHUNKNUM(pstart) >= PCHUNKNUM(pend))) {
247 printk("Ign 0x%lx-0x%lx, ", __pa(start), __pa(end));
248 return(0);
249 }
250
251 /*
252 * NODE_START and NODE_SIZE determine the physical range
253 * on the node that mem_map array needs to be set up for.
254 */
255 NODE_START(curnodeid) = ((nodemem[curnodeid].start >>
256 SN1_NODE_ADDR_SHIFT) << SN1_NODE_ADDR_SHIFT);
257 NODE_SIZE(curnodeid) = (nodemem[curnodeid].end -
258 NODE_START(curnodeid));
259
260 nodemem[curnodeid].bsize =
261 init_bootmem_node(NODE_DATA(curnodeid),
262 (__pa(nodemem[curnodeid].bstart) >> PAGE_SHIFT),
263 (__pa((nodemem[curnodeid].start >> SN1_NODE_ADDR_SHIFT)
264 << SN1_NODE_ADDR_SHIFT) >> PAGE_SHIFT),
265 (__pa(nodemem[curnodeid].end) >> PAGE_SHIFT));
266
267 } else if (nodemem[curnodeid].done == DONE_NOTHING) {
268 printk("build_node_bootmem: node %d weirdness\n", curnodeid);
269 while(1); /* Paranoia */
270 }
271
272 /*
273 * Free the entire md.
274 */
275 free_bootmem_node(NODE_DATA(curnodeid), __pa(start), (end - start));
276
277 /*
278 * Reclaim back the bootmap and kernel areas.
279 */
280 if ((start <= nodemem[curnodeid].bstart) && (end >
281 nodemem[curnodeid].bstart))
282 reserve_bootmem_node(NODE_DATA(curnodeid),
283 __pa(nodemem[curnodeid].bstart), nodemem[curnodeid].bsize);
284 if ((start <= kaddr) && (end > kaddr))
285 reserve_bootmem_node(NODE_DATA(curnodeid),
286 __pa(&_stext), (&_end - &_stext));
287
288 return(0);
289 }
290
291 void __init
292 setup_sn1_bootmem(int maxnodes)
293 {
294 int i;
295
296 for (i = 0; i < MAXNODES; i++) {
297 nodemem[i].start = nodemem[i].bstart = -1;
298 nodemem[i].end = nodemem[i].bsize = nodemem[i].mtot = 0;
299 nodemem[i].done = DONE_NOTHING;
300 memset(&nodemem[i].hole, -1, sizeof(nodemem[i].hole));
301 }
302 efi_memmap_walk(build_nodemem_map, 0);
303
304 nodemem_valid = 1;
305
306 /*
307 * After building the nodemem map, check if the node memmap
308 * will fit in the first bank of each node. If not change
309 * the node end addr till it fits.
310 */
311
312 for (i = 0; i < maxnodes; i++)
313 check_pgtbl_size(i);
314
315 dump_nodemem_map(maxnodes);
316
317 efi_memmap_walk(find_node_bootmem, 0);
318 efi_memmap_walk(build_node_bootmem, 0);
319 }
320 #endif
321
322 void __init
323 discontig_paging_init(void)
324 {
325 int i;
326 unsigned long max_dma, zones_size[MAX_NR_ZONES], holes_size[MAX_NR_ZONES];
327 extern void dump_node_data(void);
328
329 max_dma = virt_to_phys((void *) MAX_DMA_ADDRESS) >> PAGE_SHIFT;
330 for (i = 0; i < numnodes; i++) {
331 unsigned long startpfn = __pa((void *)NODE_START(i)) >> PAGE_SHIFT;
332 unsigned long numpfn = NODE_SIZE(i) >> PAGE_SHIFT;
333 memset(zones_size, 0, sizeof(zones_size));
334 memset(holes_size, 0, sizeof(holes_size));
335 holes_size[ZONE_DMA] = numpfn - nodemem[i].mtot;
336
337 if ((startpfn + numpfn) < max_dma) {
338 zones_size[ZONE_DMA] = numpfn;
339 } else if (startpfn > max_dma) {
340 zones_size[ZONE_NORMAL] = numpfn;
341 panic("discontig_paging_init: %d\n", i);
342 } else {
343 zones_size[ZONE_DMA] = (max_dma - startpfn);
344 zones_size[ZONE_NORMAL] = numpfn - zones_size[ZONE_DMA];
345 panic("discontig_paging_init: %d\n", i);
346 }
347 free_area_init_node(i, NODE_DATA(i), NULL, zones_size, startpfn<<PAGE_SHIFT, holes_size);
348 free_unused_memmap_node(i);
349 }
350 dump_node_data();
351 }
352
353 /*
354 * This used to be invoked from an SN1 specific hack in efi_memmap_walk.
355 * It tries to ignore banks which the kernel is ignoring because bank 0
356 * is too small to hold the memmap entries for this bank.
357 * The current SN1 efi_memmap_walk callbacks do not need this. That
358 * leaves the generic ia64 callbacks find_max_pfn, count_pages and
359 * count_reserved_pages, of which the first can probably get by without
360 * this, the last two probably need this, although they also can probably
361 * get by.
362 */
363 int
364 sn1_bank_ignore(u64 start, u64 end)
365 {
366 int nid = NASID_TO_CNODEID(GetNasId(__pa(end))) ;
367 int bank = GetBankId(__pa(end)) ;
368
369 if (!nodemem_valid)
370 return 0 ;
371
372 if (nodemem[nid].hole[bank] == -1)
373 return 1 ;
374 else
375 return 0 ;
376 }
377
378 void
379 dump_nodemem_map(int maxnodes)
380 {
381 int i,j;
382
383 printk("NODEMEM_S info ....\n") ;
384 printk("Node start end\n");
385 for (i=0;i<maxnodes;i++) {
386 printk("%d 0x%lx 0x%lx\n",
387 i, nodemem[i].start, nodemem[i].end);
388 printk("Holes -> ") ;
389 for (j=0;j<SN1_MAX_BANK_PER_NODE;j++)
390 printk("0x%lx ", nodemem[i].hole[j]) ;
391 printk("\n");
392 }
393 }
394
395