File: /usr/src/linux/arch/ia64/sn/io/huberror.c
1 /* $Id$
2 *
3 * This file is subject to the terms and conditions of the GNU General Public
4 * License. See the file "COPYING" in the main directory of this archive
5 * for more details.
6 *
7 * Copyright (C) 1992 - 1997, 2000 Silicon Graphics, Inc.
8 * Copyright (C) 2000 by Alan Mayer
9 */
10
11
12 #include <linux/types.h>
13 #include <linux/slab.h>
14 #include <asm/smp.h>
15 #include <asm/sn/sgi.h>
16 #include <asm/sn/iograph.h>
17 #include <asm/sn/invent.h>
18 #include <asm/sn/hcl.h>
19 #include <asm/sn/labelcl.h>
20 #include <asm/sn/nodemask.h>
21 #include <asm/sn/sn_private.h>
22 #include <asm/sn/klconfig.h>
23 #include <asm/sn/synergy.h>
24 #include <asm/sn/sn_cpuid.h>
25 #include <asm/sn/pci/pciio.h>
26 #include <asm/sn/pci/pcibr.h>
27 #include <asm/sn/xtalk/xtalk.h>
28 #include <asm/sn/pci/pcibr_private.h>
29 #include <asm/sn/intr.h>
30
31 extern void hubni_eint_init(cnodeid_t cnode);
32 extern void hubii_eint_init(cnodeid_t cnode);
33 extern void hubii_eint_handler (int irq, void *arg, struct pt_regs *ep);
34 extern void snia_error_intr_handler(int irq, void *devid, struct pt_regs *pt_regs);
35
36 extern int maxcpus;
37
38 #define HUB_ERROR_PERIOD (120 * HZ) /* 2 minutes */
39
40
41 void
42 hub_error_clear(nasid_t nasid)
43 {
44 int i;
45 hubreg_t idsr;
46 int sn;
47
48 for(sn=0; sn<NUM_SUBNODES; sn++) {
49 REMOTE_HUB_PI_S(nasid, sn, PI_ERR_INT_PEND, -1);
50 REMOTE_HUB_PI_S(nasid, sn, PI_ERR_STATUS0_A_CLR, -1);
51 REMOTE_HUB_PI_S(nasid, sn, PI_ERR_STATUS0_B_CLR, -1);
52 REMOTE_HUB_PI_S(nasid, sn, PI_SPURIOUS_HDR_0, 0);
53 REMOTE_HUB_PI_S(nasid, sn, PI_SPURIOUS_HDR_1, 0);
54 }
55
56 REMOTE_HUB_L(nasid, MD_DIR_ERROR_CLR);
57 REMOTE_HUB_L(nasid, MD_MEM_ERROR_CLR);
58 REMOTE_HUB_L(nasid, MD_MISC1_ERROR_CLR);
59 REMOTE_HUB_L(nasid, MD_PROTOCOL_ERR_CLR);
60
61 /*
62 * Make sure spurious write response errors are cleared
63 * (values are from hub_set_prb())
64 */
65 for (i = 0; i <= HUB_WIDGET_ID_MAX - HUB_WIDGET_ID_MIN + 1; i++) {
66 iprb_t prb;
67
68 prb.iprb_regval = REMOTE_HUB_L(nasid, IIO_IOPRB_0 + (i * sizeof(hubreg_t)));
69
70 /* Clear out some fields */
71 prb.iprb_ovflow = 1;
72 prb.iprb_bnakctr = 0;
73 prb.iprb_anakctr = 0;
74
75 /*
76 * PIO reads in fire-and-forget mode on bedrock 1.0 don't
77 * frob the credit count properly, making the responses appear
78 * spurious. So don't use fire-and-forget mode. Bug 761802.
79 */
80 prb.iprb_ff = 0; /* disable fire-and-forget mode by default */
81
82 prb.iprb_xtalkctr = 3; /* approx. PIO credits for the widget */
83
84 REMOTE_HUB_S(nasid, IIO_IOPRB_0 + (i * sizeof(hubreg_t)), prb.iprb_regval);
85 }
86
87 REMOTE_HUB_S(nasid, IIO_IO_ERR_CLR, -1);
88 idsr = REMOTE_HUB_L(nasid, IIO_IIDSR);
89 REMOTE_HUB_S(nasid, IIO_IIDSR, (idsr & ~(IIO_IIDSR_SENT_MASK)));
90
91 REMOTE_HUB_L(nasid, NI_PORT_ERROR_CLEAR);
92 /* No need to clear NI_PORT_HEADER regs; they are continually overwritten*/
93
94 REMOTE_HUB_S(nasid, LB_ERROR_MASK_CLR, -1);
95 REMOTE_HUB_S(nasid, LB_ERROR_HDR1, 0);
96
97 /* Clear XB error regs, in order */
98 for (i = 0;
99 i <= XB_FIRST_ERROR_CLEAR - XB_POQ0_ERROR_CLEAR;
100 i += sizeof(hubreg_t)) {
101 REMOTE_HUB_S(nasid, XB_POQ0_ERROR_CLEAR + i, 0);
102 }
103 }
104
105
106 /*
107 * Function : hub_error_init
108 * Purpose : initialize the error handling requirements for a given hub.
109 * Parameters : cnode, the compact nodeid.
110 * Assumptions : Called only once per hub, either by a local cpu. Or by a
111 * remote cpu, when this hub is headless.(cpuless)
112 * Returns : None
113 */
114
115 void
116 hub_error_init(cnodeid_t cnode)
117 {
118 nasid_t nasid;
119
120 nasid = cnodeid_to_nasid(cnode);
121 hub_error_clear(nasid);
122
123 #ifdef ajm
124 if (cnode == 0) {
125 /*
126 * Allocate log for storing the node specific error info
127 */
128 for (i = 0; i < numnodes; i++) {
129 kl_error_log[i] = kmem_zalloc_node(sizeof(sn0_error_log_t),
130 KM_NOSLEEP, i);
131 hub_err_count[i] = kmem_zalloc_node(sizeof(hub_errcnt_t),
132 VM_DIRECT | KM_NOSLEEP, i);
133 ASSERT_ALWAYS(kl_error_log[i] && hub_err_count[i]);
134 }
135 }
136
137 /*
138 * Assumption: There will be only one cpu who will initialize
139 * a hub. we need to setup the ii and each pi error interrupts.
140 * The SN1 hub (bedrock) has two PI, one for up to two processors.
141 */
142
143 if (cpuid_to_cnodeid(smp_processor_id()) == cnode) {
144 int generic_intr_mask = PI_ERR_GENERIC; /* These interrupts are sent to only 1 CPU per NODE */
145
146 ASSERT_ALWAYS(kl_error_log[cnode]);
147 ASSERT_ALWAYS(hub_err_count[cnode]);
148 MD_ERR_LOG_INIT(kl_error_log[cnode]);
149
150 /* One for each CPU */
151 recover_error_init(RECOVER_ERROR_TABLE(cnode, 0));
152 recover_error_init(RECOVER_ERROR_TABLE(cnode, 1));
153 recover_error_init(RECOVER_ERROR_TABLE(cnode, 2));
154 recover_error_init(RECOVER_ERROR_TABLE(cnode, 3));
155
156 /*
157 * Setup error intr masks.
158 */
159 for(sn=0; sn<NUM_SUBNODES; sn++) {
160 int cpuA_present = REMOTE_HUB_PI_L(nasid, sn, PI_CPU_ENABLE_A);
161 int cpuB_present = REMOTE_HUB_PI_L(nasid, sn, PI_CPU_ENABLE_B);
162
163 if (cpuA_present) {
164 if (cpuB_present) { /* A && B */
165 REMOTE_HUB_PI_S(nasid, sn, PI_ERR_INT_MASK_A,
166 (PI_FATAL_ERR_CPU_B | PI_MISC_ERR_CPU_A|generic_intr_mask));
167 REMOTE_HUB_PI_S(nasid, sn, PI_ERR_INT_MASK_B,
168 (PI_FATAL_ERR_CPU_A | PI_MISC_ERR_CPU_B));
169
170 } else { /* A && !B */
171 REMOTE_HUB_PI_S(nasid, sn, PI_ERR_INT_MASK_A,
172 (PI_FATAL_ERR_CPU_A | PI_MISC_ERR_CPU_A|generic_intr_mask));
173 }
174 generic_intr_mask = 0;
175 } else {
176 if (cpuB_present) { /* !A && B */
177 REMOTE_HUB_PI_S(nasid, sn, PI_ERR_INT_MASK_B,
178 (PI_FATAL_ERR_CPU_B | PI_MISC_ERR_CPU_B|generic_intr_mask));
179 generic_intr_mask = 0;
180
181 } else { /* !A && !B */
182 /* nothing to set up */
183 }
184 }
185 }
186
187 /*
188 * Turn off UNCAC_UNCORR interrupt in the masks. Anyone interested
189 * in these errors will peek at the int pend register to see if its
190 * set.
191 */
192 for(sn=0; sn<NUM_SUBNODES; sn++) {
193 misc = REMOTE_HUB_PI_L(nasid, sn, PI_ERR_INT_MASK_A);
194 REMOTE_HUB_PI_S(nasid, sn, PI_ERR_INT_MASK_A, (misc & ~PI_ERR_UNCAC_UNCORR_A));
195 misc = REMOTE_HUB_PI_L(nasid, sn, PI_ERR_INT_MASK_B);
196 REMOTE_HUB_PI_S(nasid, sn, PI_ERR_INT_MASK_B, (misc & ~PI_ERR_UNCAC_UNCORR_B));
197 }
198
199 /*
200 * enable all error indicators to turn on, in case of errors.
201 *
202 * This is not good on single cpu node boards.
203 **** LOCAL_HUB_S(PI_SYSAD_ERRCHK_EN, PI_SYSAD_CHECK_ALL);
204 */
205 for(sn=0; sn<NUM_SUBNODES; sn++) {
206 REMOTE_HUB_PI_S(nasid, sn, PI_ERR_STATUS1_A_CLR, 0);
207 REMOTE_HUB_PI_S(nasid, sn, PI_ERR_STATUS1_B_CLR, 0);
208 }
209
210 /* Set up stack for each present processor */
211 for(sn=0; sn<NUM_SUBNODES; sn++) {
212 if (REMOTE_HUB_PI_L(nasid, sn, PI_CPU_PRESENT_A)) {
213 SN0_ERROR_LOG(cnode)->el_spool_cur_addr[0] =
214 SN0_ERROR_LOG(cnode)->el_spool_last_addr[0] =
215 REMOTE_HUB_PI_L(nasid, sn, PI_ERR_STACK_ADDR_A);
216 }
217
218 if (REMOTE_HUB_PI_L(nasid, sn, PI_CPU_PRESENT_B)) {
219 SN0_ERROR_LOG(cnode)->el_spool_cur_addr[1] =
220 SN0_ERROR_LOG(cnode)->el_spool_last_addr[1] =
221 REMOTE_HUB_PI_L(nasid, sn, PI_ERR_STACK_ADDR_B);
222 }
223 }
224
225
226 PI_SPOOL_SIZE_BYTES =
227 ERR_STACK_SIZE_BYTES(REMOTE_HUB_L(nasid, PI_ERR_STACK_SIZE));
228
229 #ifdef BRINGUP
230 /* BRINGUP: The following code looks like a check to make sure
231 the prom set up the error spool correctly for 2 processors. I
232 don't think it is needed. */
233 for(sn=0; sn<NUM_SUBNODES; sn++) {
234 if (REMOTE_HUB_PI_L(nasid, sn, PI_CPU_PRESENT_B)) {
235 __psunsigned_t addr_a = REMOTE_HUB_PI_L(nasid, sn, PI_ERR_STACK_ADDR_A);
236 __psunsigned_t addr_b = REMOTE_HUB_PI_L(nasid, sn, PI_ERR_STACK_ADDR_B);
237 if ((addr_a & ~0xff) == (addr_b & ~0xff)) {
238 REMOTE_HUB_PI_S(nasid, sn, PI_ERR_STACK_ADDR_B,
239 addr_b + PI_SPOOL_SIZE_BYTES);
240
241 SN0_ERROR_LOG(cnode)->el_spool_cur_addr[1] =
242 SN0_ERROR_LOG(cnode)->el_spool_last_addr[1] =
243 REMOTE_HUB_PI_L(nasid, sn, PI_ERR_STACK_ADDR_B);
244
245 }
246 }
247 }
248 #endif /* BRINGUP */
249
250 /* programming our own hub. Enable error_int_pend intr.
251 * If both present, CPU A takes CPU b's error interrupts and any
252 * generic ones. CPU B takes CPU A error ints.
253 */
254 if (cause_intr_connect (SRB_ERR_IDX,
255 (intr_func_t)(hubpi_eint_handler),
256 SR_ALL_MASK|SR_IE)) {
257 cmn_err(ERR_WARN,
258 "hub_error_init: cause_intr_connect failed on %d", cnode);
259 }
260 }
261 else {
262 /* programming remote hub. The only valid reason that this
263 * is called will be on headless hubs. No interrupts
264 */
265 for(sn=0; sn<NUM_SUBNODES; sn++) {
266 REMOTE_HUB_PI_S(nasid, sn, PI_ERR_INT_MASK_A, 0); /* not necessary */
267 REMOTE_HUB_PI_S(nasid, sn, PI_ERR_INT_MASK_B, 0); /* not necessary */
268 }
269 }
270 #endif /* ajm */
271 /*
272 * Now setup the hub ii and ni error interrupt handler.
273 */
274
275 hubii_eint_init(cnode);
276 hubni_eint_init(cnode);
277
278 #ifdef ajm
279 /*** XXX FIXME XXX resolve the following***/
280 /* INT_PEND1 bits set up for one hub only:
281 * SHUTDOWN_INTR
282 * MD_COR_ERR_INTR
283 * COR_ERR_INTR_A and COR_ERR_INTR_B should be sent to the
284 * appropriate CPU only.
285 */
286
287 if (cnode == 0) {
288 error_consistency_check.eps_state = 0;
289 error_consistency_check.eps_cpuid = -1;
290 spinlock_init(&error_consistency_check.eps_lock, "error_dump_lock");
291 }
292 #endif
293
294 nodepda->huberror_ticks = HUB_ERROR_PERIOD;
295 return;
296 }
297
298 /*
299 * Function : hubii_eint_init
300 * Parameters : cnode
301 * Purpose : to initialize the hub iio error interrupt.
302 * Assumptions : Called once per hub, by the cpu which will ultimately
303 * handle this interrupt.
304 * Returns : None.
305 */
306
307
308 void
309 hubii_eint_init(cnodeid_t cnode)
310 {
311 int bit, rv;
312 ii_iidsr_u_t hubio_eint;
313 hubinfo_t hinfo;
314 cpuid_t intr_cpu;
315 devfs_handle_t hub_v;
316 ii_ilcsr_u_t ilcsr;
317
318 hub_v = (devfs_handle_t)cnodeid_to_vertex(cnode);
319 ASSERT_ALWAYS(hub_v);
320 hubinfo_get(hub_v, &hinfo);
321
322 ASSERT(hinfo);
323 ASSERT(hinfo->h_cnodeid == cnode);
324
325 ilcsr.ii_ilcsr_regval = REMOTE_HUB_L(hinfo->h_nasid, IIO_ILCSR);
326
327 if ((ilcsr.ii_ilcsr_fld_s.i_llp_stat & 0x2) == 0) {
328 /*
329 * HUB II link is not up.
330 * Just disable LLP, and don't connect any interrupts.
331 */
332 ilcsr.ii_ilcsr_fld_s.i_llp_en = 0;
333 REMOTE_HUB_S(hinfo->h_nasid, IIO_ILCSR, ilcsr.ii_ilcsr_regval);
334 return;
335 }
336 /* Select a possible interrupt target where there is a free interrupt
337 * bit and also reserve the interrupt bit for this IO error interrupt
338 */
339 intr_cpu = intr_heuristic(hub_v,0,INTRCONNECT_ANYBIT,II_ERRORINT,hub_v,
340 "HUB IO error interrupt",&bit);
341 if (intr_cpu == CPU_NONE) {
342 printk("hubii_eint_init: intr_reserve_level failed, cnode %d", cnode);
343 return;
344 }
345
346 rv = intr_connect_level(intr_cpu, bit, 0,(intr_func_t)(NULL),
347 (void *)(long)hub_v, NULL);
348 synergy_intr_connect(bit, intr_cpu);
349 request_irq(bit_pos_to_irq(bit) + (intr_cpu << 8), hubii_eint_handler, 0, NULL, (void *)hub_v);
350 ASSERT_ALWAYS(rv >= 0);
351 hubio_eint.ii_iidsr_regval = 0;
352 hubio_eint.ii_iidsr_fld_s.i_enable = 1;
353 hubio_eint.ii_iidsr_fld_s.i_level = bit;/* Take the least significant bits*/
354 hubio_eint.ii_iidsr_fld_s.i_node = COMPACT_TO_NASID_NODEID(cnode);
355 hubio_eint.ii_iidsr_fld_s.i_pi_id = cpuid_to_subnode(intr_cpu);
356 REMOTE_HUB_S(hinfo->h_nasid, IIO_IIDSR, hubio_eint.ii_iidsr_regval);
357
358 }
359
360 void
361 hubni_eint_init(cnodeid_t cnode)
362 {
363 int intr_bit;
364 cpuid_t targ;
365
366
367 if ((targ = cnodeid_to_cpuid(cnode)) == CPU_NONE)
368 return;
369
370 /* The prom chooses which cpu gets these interrupts, but we
371 * don't know which one it chose. We will register all of the
372 * cpus to be sure. This only costs us an irqaction per cpu.
373 */
374 for (; targ < CPUS_PER_NODE; targ++) {
375 if (!cpu_enabled(targ) ) continue;
376 /* connect the INTEND1 bits. */
377 for (intr_bit = XB_ERROR; intr_bit <= MSC_PANIC_INTR; intr_bit++) {
378 intr_connect_level(targ, intr_bit, II_ERRORINT, NULL, NULL, NULL);
379 }
380 request_irq(SGI_HUB_ERROR_IRQ + (targ << 8), snia_error_intr_handler, 0, NULL, NULL);
381 /* synergy masks are initialized in the prom to enable all interrupts. */
382 /* We'll just leave them that way, here, for these interrupts. */
383 }
384 }
385
386
387 /*ARGSUSED*/
388 void
389 hubii_eint_handler (int irq, void *arg, struct pt_regs *ep)
390 {
391 devfs_handle_t hub_v;
392 hubinfo_t hinfo;
393 ii_wstat_u_t wstat;
394 hubreg_t idsr;
395
396 panic("Hubii interrupt\n");
397 #ifdef ajm
398 /*
399 * If the NI has a problem, everyone has a problem. We shouldn't
400 * even attempt to handle other errors when an NI error is present.
401 */
402 if (check_ni_errors()) {
403 hubni_error_handler("II interrupt", 1);
404 /* NOTREACHED */
405 }
406
407 /* two levels of casting avoids compiler warning.!! */
408 hub_v = (devfs_handle_t)(long)(arg);
409 ASSERT(hub_v);
410
411 hubinfo_get(hub_v, &hinfo);
412
413 /*
414 * Identify the reason for error.
415 */
416 wstat.ii_wstat_regval = REMOTE_HUB_L(hinfo->h_nasid, IIO_WSTAT);
417
418 if (wstat.ii_wstat_fld_s.w_crazy) {
419 char *reason;
420 /*
421 * We can do a couple of things here.
422 * Look at the fields TX_MX_RTY/XT_TAIL_TO/XT_CRD_TO to check
423 * which of these caused the CRAZY bit to be set.
424 * You may be able to check if the Link is up really.
425 */
426 if (wstat.ii_wstat_fld_s.w_tx_mx_rty)
427 reason = "Micro Packet Retry Timeout";
428 else if (wstat.ii_wstat_fld_s.w_xt_tail_to)
429 reason = "Crosstalk Tail Timeout";
430 else if (wstat.ii_wstat_fld_s.w_xt_crd_to)
431 reason = "Crosstalk Credit Timeout";
432 else {
433 hubreg_t hubii_imem;
434 /*
435 * Check if widget 0 has been marked as shutdown, or
436 * if BTE 0/1 has been marked.
437 */
438 hubii_imem = REMOTE_HUB_L(hinfo->h_nasid, IIO_IMEM);
439 if (hubii_imem & IIO_IMEM_W0ESD)
440 reason = "Hub Widget 0 has been Shutdown";
441 else if (hubii_imem & IIO_IMEM_B0ESD)
442 reason = "BTE 0 has been shutdown";
443 else if (hubii_imem & IIO_IMEM_B1ESD)
444 reason = "BTE 1 has been shutdown";
445 else reason = "Unknown";
446
447 }
448 /*
449 * Note: we may never be able to print this, if the II talking
450 * to Xbow which hosts the console is dead.
451 */
452 printk("Hub %d to Xtalk Link failed (II_ECRAZY) Reason: %s",
453 hinfo->h_cnodeid, reason);
454 }
455
456 /*
457 * It's a toss as to which one among PRB/CRB to check first.
458 * Current decision is based on the severity of the errors.
459 * IO CRB errors tend to be more severe than PRB errors.
460 *
461 * It is possible for BTE errors to have been handled already, so we
462 * may not see any errors handled here.
463 */
464 (void)hubiio_crb_error_handler(hub_v, hinfo);
465 (void)hubiio_prb_error_handler(hub_v, hinfo);
466 /*
467 * If we reach here, it indicates crb/prb handlers successfully
468 * handled the error. So, re-enable II to send more interrupt
469 * and return.
470 */
471 REMOTE_HUB_S(hinfo->h_nasid, IIO_IECLR, 0xffffff);
472 idsr = REMOTE_HUB_L(hinfo->h_nasid, IIO_IIDSR) & ~IIO_IIDSR_SENT_MASK;
473 REMOTE_HUB_S(hinfo->h_nasid, IIO_IIDSR, idsr);
474 #endif /* ajm */
475 }
476