File: /usr/src/linux/arch/ia64/sn/io/huberror.c

1     /* $Id$
2      *
3      * This file is subject to the terms and conditions of the GNU General Public
4      * License.  See the file "COPYING" in the main directory of this archive
5      * for more details.
6      *
7      * Copyright (C) 1992 - 1997, 2000 Silicon Graphics, Inc.
8      * Copyright (C) 2000 by Alan Mayer
9      */
10     
11     
12     #include <linux/types.h>
13     #include <linux/slab.h>
14     #include <asm/smp.h>
15     #include <asm/sn/sgi.h>
16     #include <asm/sn/iograph.h>
17     #include <asm/sn/invent.h>
18     #include <asm/sn/hcl.h>
19     #include <asm/sn/labelcl.h>
20     #include <asm/sn/nodemask.h>
21     #include <asm/sn/sn_private.h>
22     #include <asm/sn/klconfig.h>
23     #include <asm/sn/synergy.h>
24     #include <asm/sn/sn_cpuid.h>
25     #include <asm/sn/pci/pciio.h>
26     #include <asm/sn/pci/pcibr.h>
27     #include <asm/sn/xtalk/xtalk.h>
28     #include <asm/sn/pci/pcibr_private.h>
29     #include <asm/sn/intr.h>
30     
31     extern void hubni_eint_init(cnodeid_t cnode);
32     extern void hubii_eint_init(cnodeid_t cnode);
33     extern void hubii_eint_handler (int irq, void *arg, struct pt_regs *ep);
34     extern void snia_error_intr_handler(int irq, void *devid, struct pt_regs *pt_regs);
35     
36     extern int maxcpus;
37     
38     #define HUB_ERROR_PERIOD        (120 * HZ)      /* 2 minutes */
39     
40     
41     void
42     hub_error_clear(nasid_t nasid)
43     {
44     	int i;
45     	hubreg_t idsr;
46     	int sn;
47     
48     	for(sn=0; sn<NUM_SUBNODES; sn++) {
49     		REMOTE_HUB_PI_S(nasid, sn, PI_ERR_INT_PEND, -1);
50     		REMOTE_HUB_PI_S(nasid, sn, PI_ERR_STATUS0_A_CLR, -1);
51     		REMOTE_HUB_PI_S(nasid, sn, PI_ERR_STATUS0_B_CLR, -1);
52     		REMOTE_HUB_PI_S(nasid, sn, PI_SPURIOUS_HDR_0, 0);
53     		REMOTE_HUB_PI_S(nasid, sn, PI_SPURIOUS_HDR_1, 0);
54     	}
55     
56     	REMOTE_HUB_L(nasid, MD_DIR_ERROR_CLR);
57     	REMOTE_HUB_L(nasid, MD_MEM_ERROR_CLR);
58     	REMOTE_HUB_L(nasid, MD_MISC1_ERROR_CLR);
59     	REMOTE_HUB_L(nasid, MD_PROTOCOL_ERR_CLR);
60     
61         /*
62          * Make sure spurious write response errors are cleared
63          * (values are from hub_set_prb())
64          */
65         for (i = 0; i <= HUB_WIDGET_ID_MAX - HUB_WIDGET_ID_MIN + 1; i++) {
66             iprb_t prb;
67     
68     	prb.iprb_regval = REMOTE_HUB_L(nasid, IIO_IOPRB_0 + (i * sizeof(hubreg_t)));
69     
70             /* Clear out some fields */
71             prb.iprb_ovflow = 1;
72             prb.iprb_bnakctr = 0;
73             prb.iprb_anakctr = 0;
74     
75     	/*
76     	 * PIO reads in fire-and-forget mode on bedrock 1.0 don't
77     	 * frob the credit count properly, making the responses appear
78     	 * spurious.  So don't use fire-and-forget mode.  Bug 761802.
79     	 */
80             prb.iprb_ff = 0;        /* disable fire-and-forget mode by default */
81     
82             prb.iprb_xtalkctr = 3;  /* approx. PIO credits for the widget */
83     
84             REMOTE_HUB_S(nasid, IIO_IOPRB_0 + (i * sizeof(hubreg_t)), prb.iprb_regval);
85         }
86     
87         REMOTE_HUB_S(nasid, IIO_IO_ERR_CLR, -1);
88         idsr = REMOTE_HUB_L(nasid, IIO_IIDSR);
89         REMOTE_HUB_S(nasid, IIO_IIDSR, (idsr & ~(IIO_IIDSR_SENT_MASK)));
90     
91         REMOTE_HUB_L(nasid, NI_PORT_ERROR_CLEAR);
92         /* No need to clear NI_PORT_HEADER regs; they are continually overwritten*/
93     
94         REMOTE_HUB_S(nasid, LB_ERROR_MASK_CLR, -1);
95         REMOTE_HUB_S(nasid, LB_ERROR_HDR1, 0);
96     
97         /* Clear XB error regs, in order */
98         for (i = 0;
99              i <= XB_FIRST_ERROR_CLEAR - XB_POQ0_ERROR_CLEAR;
100              i += sizeof(hubreg_t)) {
101             REMOTE_HUB_S(nasid, XB_POQ0_ERROR_CLEAR + i, 0);
102         }
103     }
104     
105     
106     /*
107      * Function	: hub_error_init
108      * Purpose	: initialize the error handling requirements for a given hub.
109      * Parameters	: cnode, the compact nodeid.
110      * Assumptions	: Called only once per hub, either by a local cpu. Or by a 
111      *			remote cpu, when this hub is headless.(cpuless)
112      * Returns	: None
113      */
114     
115     void
116     hub_error_init(cnodeid_t cnode)
117     {
118     	nasid_t nasid;
119     
120         nasid = cnodeid_to_nasid(cnode);
121         hub_error_clear(nasid);
122     
123     #ifdef ajm
124         if (cnode == 0) {
125     	/*
126     	 * Allocate log for storing the node specific error info
127     	 */
128     	for (i = 0; i < numnodes; i++) {
129     	    kl_error_log[i]  = kmem_zalloc_node(sizeof(sn0_error_log_t), 
130     						KM_NOSLEEP, i);
131     	    hub_err_count[i] = kmem_zalloc_node(sizeof(hub_errcnt_t),
132     						VM_DIRECT | KM_NOSLEEP, i);
133     	    ASSERT_ALWAYS(kl_error_log[i] && hub_err_count[i]);
134     	}
135         }
136     
137         /*
138          * Assumption: There will be only one cpu who will initialize
139          * a hub. we need to setup the ii and each pi error interrupts.
140          * The SN1 hub (bedrock) has two PI, one for up to two processors.
141          */
142     
143         if (cpuid_to_cnodeid(smp_processor_id()) == cnode) { 
144     	int generic_intr_mask = PI_ERR_GENERIC; /* These interrupts are sent to only 1 CPU per NODE */
145     
146     	ASSERT_ALWAYS(kl_error_log[cnode]);
147     	ASSERT_ALWAYS(hub_err_count[cnode]);
148     	MD_ERR_LOG_INIT(kl_error_log[cnode]);
149     
150     	/* One for each CPU */
151     	recover_error_init(RECOVER_ERROR_TABLE(cnode, 0));
152     	recover_error_init(RECOVER_ERROR_TABLE(cnode, 1));
153     	recover_error_init(RECOVER_ERROR_TABLE(cnode, 2));
154     	recover_error_init(RECOVER_ERROR_TABLE(cnode, 3));
155     
156     	/*
157     	 * Setup error intr masks.
158     	 */
159     	for(sn=0; sn<NUM_SUBNODES; sn++) {
160     		int cpuA_present = REMOTE_HUB_PI_L(nasid, sn, PI_CPU_ENABLE_A);
161     		int cpuB_present = REMOTE_HUB_PI_L(nasid, sn, PI_CPU_ENABLE_B);
162     
163     		if (cpuA_present) {
164     			if (cpuB_present) {		/* A && B */
165     	    			REMOTE_HUB_PI_S(nasid, sn, PI_ERR_INT_MASK_A,
166     					(PI_FATAL_ERR_CPU_B | PI_MISC_ERR_CPU_A|generic_intr_mask));
167     	    			REMOTE_HUB_PI_S(nasid, sn, PI_ERR_INT_MASK_B,
168     					(PI_FATAL_ERR_CPU_A | PI_MISC_ERR_CPU_B));
169     
170     			} else {			/* A && !B */
171     	    			REMOTE_HUB_PI_S(nasid, sn, PI_ERR_INT_MASK_A,
172     					(PI_FATAL_ERR_CPU_A | PI_MISC_ERR_CPU_A|generic_intr_mask));
173     			}
174     			generic_intr_mask = 0;
175     		} else {
176     			if (cpuB_present) {		/* !A && B */
177     	    			REMOTE_HUB_PI_S(nasid, sn, PI_ERR_INT_MASK_B,
178     					(PI_FATAL_ERR_CPU_B | PI_MISC_ERR_CPU_B|generic_intr_mask));
179     				generic_intr_mask = 0;
180     
181     			} else {			/* !A && !B */
182     				/* nothing to set up */
183     			}
184     		}
185     	}
186     
187     	/*
188     	 * Turn off UNCAC_UNCORR interrupt in the masks. Anyone interested
189     	 * in these errors will peek at the int pend register to see if its
190     	 * set.
191     	 */ 
192     	for(sn=0; sn<NUM_SUBNODES; sn++) {
193     		misc = REMOTE_HUB_PI_L(nasid, sn, PI_ERR_INT_MASK_A);
194     		REMOTE_HUB_PI_S(nasid, sn, PI_ERR_INT_MASK_A, (misc & ~PI_ERR_UNCAC_UNCORR_A));
195     		misc = REMOTE_HUB_PI_L(nasid, sn, PI_ERR_INT_MASK_B);
196     		REMOTE_HUB_PI_S(nasid, sn, PI_ERR_INT_MASK_B, (misc & ~PI_ERR_UNCAC_UNCORR_B));
197     	}
198     
199     	/*
200     	 * enable all error indicators to turn on, in case of errors.
201     	 *
202     	 * This is not good on single cpu node boards.
203     	 **** LOCAL_HUB_S(PI_SYSAD_ERRCHK_EN, PI_SYSAD_CHECK_ALL);
204     	 */
205     	for(sn=0; sn<NUM_SUBNODES; sn++) {
206     		REMOTE_HUB_PI_S(nasid, sn, PI_ERR_STATUS1_A_CLR, 0);
207     		REMOTE_HUB_PI_S(nasid, sn, PI_ERR_STATUS1_B_CLR, 0);
208     	}
209     
210     	/* Set up stack for each present processor */
211     	for(sn=0; sn<NUM_SUBNODES; sn++) {
212     		if (REMOTE_HUB_PI_L(nasid, sn, PI_CPU_PRESENT_A)) {
213     	    	SN0_ERROR_LOG(cnode)->el_spool_cur_addr[0] =
214     			SN0_ERROR_LOG(cnode)->el_spool_last_addr[0] =
215     		    	REMOTE_HUB_PI_L(nasid, sn, PI_ERR_STACK_ADDR_A);
216     		}
217     	    
218     		if (REMOTE_HUB_PI_L(nasid, sn, PI_CPU_PRESENT_B)) {
219     	    	SN0_ERROR_LOG(cnode)->el_spool_cur_addr[1] =
220     			SN0_ERROR_LOG(cnode)->el_spool_last_addr[1] =
221     		    	REMOTE_HUB_PI_L(nasid, sn, PI_ERR_STACK_ADDR_B);
222     		}
223     	}
224     
225     
226     	PI_SPOOL_SIZE_BYTES = 
227     	    ERR_STACK_SIZE_BYTES(REMOTE_HUB_L(nasid, PI_ERR_STACK_SIZE));
228     
229     #ifdef BRINGUP
230     /* BRINGUP: The following code looks like a check to make sure
231     the prom set up the error spool correctly for 2 processors.  I
232     don't think it is needed.  */
233     	for(sn=0; sn<NUM_SUBNODES; sn++) {
234     		if (REMOTE_HUB_PI_L(nasid, sn, PI_CPU_PRESENT_B)) {
235     			__psunsigned_t addr_a = REMOTE_HUB_PI_L(nasid, sn, PI_ERR_STACK_ADDR_A);
236     			__psunsigned_t addr_b = REMOTE_HUB_PI_L(nasid, sn, PI_ERR_STACK_ADDR_B);
237     			if ((addr_a & ~0xff) == (addr_b & ~0xff)) {
238     			    REMOTE_HUB_PI_S(nasid, sn, PI_ERR_STACK_ADDR_B, 	
239     					addr_b + PI_SPOOL_SIZE_BYTES);
240     	
241     			    SN0_ERROR_LOG(cnode)->el_spool_cur_addr[1] =
242     				SN0_ERROR_LOG(cnode)->el_spool_last_addr[1] =
243     				    REMOTE_HUB_PI_L(nasid, sn, PI_ERR_STACK_ADDR_B);
244     	
245     		    }
246     		}
247     	}
248     #endif /* BRINGUP */
249     
250     	/* programming our own hub. Enable error_int_pend intr.
251     	 * If both present, CPU A takes CPU b's error interrupts and any
252     	 * generic ones. CPU B takes CPU A error ints.
253     	 */
254     	if (cause_intr_connect (SRB_ERR_IDX,
255     				(intr_func_t)(hubpi_eint_handler),
256     				SR_ALL_MASK|SR_IE)) {
257     	    cmn_err(ERR_WARN, 
258     		    "hub_error_init: cause_intr_connect failed on %d", cnode);
259     	}
260         }
261         else {
262     	/* programming remote hub. The only valid reason that this
263     	 * is called will be on headless hubs. No interrupts 
264     	 */
265     	for(sn=0; sn<NUM_SUBNODES; sn++) {
266     		REMOTE_HUB_PI_S(nasid, sn, PI_ERR_INT_MASK_A, 0); /* not necessary */
267     		REMOTE_HUB_PI_S(nasid, sn, PI_ERR_INT_MASK_B, 0); /* not necessary */
268     	}
269         }
270     #endif /* ajm */
271         /*
272          * Now setup the hub ii and ni error interrupt handler.
273          */
274     
275         hubii_eint_init(cnode);
276         hubni_eint_init(cnode);
277     
278     #ifdef ajm
279         /*** XXX FIXME XXX resolve the following***/
280         /* INT_PEND1 bits set up for one hub only:
281          *	SHUTDOWN_INTR
282          *	MD_COR_ERR_INTR
283          *  COR_ERR_INTR_A and COR_ERR_INTR_B should be sent to the
284          *  appropriate CPU only.
285          */
286     
287         if (cnode == 0) {
288     	    error_consistency_check.eps_state = 0;
289     	    error_consistency_check.eps_cpuid = -1;
290     	    spinlock_init(&error_consistency_check.eps_lock, "error_dump_lock");
291         }
292     #endif
293     
294         nodepda->huberror_ticks = HUB_ERROR_PERIOD;
295         return;
296     }
297     
298     /*
299      * Function	: hubii_eint_init
300      * Parameters	: cnode
301      * Purpose	: to initialize the hub iio error interrupt.
302      * Assumptions	: Called once per hub, by the cpu which will ultimately
303      *			handle this interrupt.
304      * Returns	: None.
305      */
306     
307     
308     void
309     hubii_eint_init(cnodeid_t cnode)
310     {
311         int			bit, rv;
312         ii_iidsr_u_t    	hubio_eint;
313         hubinfo_t		hinfo; 
314         cpuid_t		intr_cpu;
315         devfs_handle_t 	hub_v;
316         ii_ilcsr_u_t	ilcsr;
317     
318         hub_v = (devfs_handle_t)cnodeid_to_vertex(cnode);
319         ASSERT_ALWAYS(hub_v);
320         hubinfo_get(hub_v, &hinfo);
321     
322         ASSERT(hinfo);
323         ASSERT(hinfo->h_cnodeid == cnode);
324     
325         ilcsr.ii_ilcsr_regval = REMOTE_HUB_L(hinfo->h_nasid, IIO_ILCSR);
326     
327         if ((ilcsr.ii_ilcsr_fld_s.i_llp_stat & 0x2) == 0) {
328     	/* 
329     	 * HUB II link is not up. 
330     	 * Just disable LLP, and don't connect any interrupts.
331     	 */
332     	ilcsr.ii_ilcsr_fld_s.i_llp_en = 0;
333     	REMOTE_HUB_S(hinfo->h_nasid, IIO_ILCSR, ilcsr.ii_ilcsr_regval);
334     	return;
335         }
336         /* Select a possible interrupt target where there is a free interrupt
337          * bit and also reserve the interrupt bit for this IO error interrupt
338          */
339         intr_cpu = intr_heuristic(hub_v,0,INTRCONNECT_ANYBIT,II_ERRORINT,hub_v,
340     			      "HUB IO error interrupt",&bit);
341         if (intr_cpu == CPU_NONE) {
342     	printk("hubii_eint_init: intr_reserve_level failed, cnode %d", cnode);
343     	return;
344         }
345     	
346         rv = intr_connect_level(intr_cpu, bit, 0,(intr_func_t)(NULL),
347     			    (void *)(long)hub_v, NULL);
348         synergy_intr_connect(bit, intr_cpu);
349         request_irq(bit_pos_to_irq(bit) + (intr_cpu << 8), hubii_eint_handler, 0, NULL, (void *)hub_v);
350         ASSERT_ALWAYS(rv >= 0);
351         hubio_eint.ii_iidsr_regval = 0;
352         hubio_eint.ii_iidsr_fld_s.i_enable = 1;
353         hubio_eint.ii_iidsr_fld_s.i_level = bit;/* Take the least significant bits*/
354         hubio_eint.ii_iidsr_fld_s.i_node = COMPACT_TO_NASID_NODEID(cnode);
355         hubio_eint.ii_iidsr_fld_s.i_pi_id = cpuid_to_subnode(intr_cpu);
356         REMOTE_HUB_S(hinfo->h_nasid, IIO_IIDSR, hubio_eint.ii_iidsr_regval);
357     
358     }
359     
360     void
361     hubni_eint_init(cnodeid_t cnode)
362     {
363         int intr_bit;
364         cpuid_t targ;
365     
366     
367         if ((targ = cnodeid_to_cpuid(cnode)) == CPU_NONE)
368     	return;
369     
370     	/* The prom chooses which cpu gets these interrupts, but we
371     	*  don't know which one it chose.  We will register all of the 
372     	*  cpus to be sure.  This only costs us an irqaction per cpu.
373     	*/
374         for (; targ < CPUS_PER_NODE; targ++) {
375     	if (!cpu_enabled(targ) ) continue;
376     	/* connect the INTEND1 bits. */
377     	for (intr_bit = XB_ERROR; intr_bit <= MSC_PANIC_INTR; intr_bit++) {
378     		intr_connect_level(targ, intr_bit, II_ERRORINT, NULL, NULL, NULL);
379     	}
380     	request_irq(SGI_HUB_ERROR_IRQ + (targ << 8), snia_error_intr_handler, 0, NULL, NULL);
381     	/* synergy masks are initialized in the prom to enable all interrupts. */
382     	/* We'll just leave them that way, here, for these interrupts. */
383         }
384     }
385     
386     
/*
 * Function	: hubii_eint_handler
 * Parameters	: irq, arg, ep -- the interrupt handler arguments.  Only arg
 *		  is used, and only by the disabled code, which treats it as
 *		  the hub vertex handle.
 * Purpose	: hub II error interrupt handler.  In the default build it
 *		  simply panics.  The diagnosis and recovery code is kept
 *		  under "#ifdef ajm" and sits after the panic() call.
 * Returns	: None.
 */
/*ARGSUSED*/
void
hubii_eint_handler (int irq, void *arg, struct pt_regs *ep)
{
#ifdef ajm
	/* Only the disabled code below uses these. */
	devfs_handle_t	hub_v;
	hubinfo_t	hinfo;
	ii_wstat_u_t	wstat;
	hubreg_t	idsr;
#endif /* ajm */

	panic("Hubii interrupt\n");
#ifdef ajm
	/*
	 * If the NI has a problem, everyone has a problem.  We shouldn't
	 * even attempt to handle other errors when an NI error is present.
	 */
	if (check_ni_errors()) {
		hubni_error_handler("II interrupt", 1);
		/* NOTREACHED */
	}

	/* two levels of casting avoids compiler warning.!! */
	hub_v = (devfs_handle_t)(long)(arg);
	ASSERT(hub_v);

	hubinfo_get(hub_v, &hinfo);

	/*
	 * Identify the reason for error.
	 */
	wstat.ii_wstat_regval = REMOTE_HUB_L(hinfo->h_nasid, IIO_WSTAT);

	if (wstat.ii_wstat_fld_s.w_crazy) {
		char	*reason;
		/*
		 * We can do a couple of things here.
		 * Look at the fields TX_MX_RTY/XT_TAIL_TO/XT_CRD_TO to check
		 * which of these caused the CRAZY bit to be set.
		 * You may be able to check if the Link is up really.
		 */
		if (wstat.ii_wstat_fld_s.w_tx_mx_rty)
			reason = "Micro Packet Retry Timeout";
		else if (wstat.ii_wstat_fld_s.w_xt_tail_to)
			reason = "Crosstalk Tail Timeout";
		else if (wstat.ii_wstat_fld_s.w_xt_crd_to)
			reason = "Crosstalk Credit Timeout";
		else {
			hubreg_t	hubii_imem;
			/*
			 * Check if widget 0 has been marked as shutdown, or
			 * if BTE 0/1 has been marked.
			 */
			hubii_imem = REMOTE_HUB_L(hinfo->h_nasid, IIO_IMEM);
			if (hubii_imem & IIO_IMEM_W0ESD)
				reason = "Hub Widget 0 has been Shutdown";
			else if (hubii_imem & IIO_IMEM_B0ESD)
				reason = "BTE 0 has been shutdown";
			else if (hubii_imem & IIO_IMEM_B1ESD)
				reason = "BTE 1 has been shutdown";
			else	reason = "Unknown";

		}
		/*
		 * Note: we may never be able to print this, if the II talking
		 * to Xbow which hosts the console is dead.
		 */
		printk("Hub %d to Xtalk Link failed (II_ECRAZY) Reason: %s\n",
			hinfo->h_cnodeid, reason);
	}

	/*
	 * It's a toss as to which one among PRB/CRB to check first.
	 * Current decision is based on the severity of the errors.
	 * IO CRB errors tend to be more severe than PRB errors.
	 *
	 * It is possible for BTE errors to have been handled already, so we
	 * may not see any errors handled here.
	 */
	(void)hubiio_crb_error_handler(hub_v, hinfo);
	(void)hubiio_prb_error_handler(hub_v, hinfo);
	/*
	 * If we reach here, it indicates crb/prb handlers successfully
	 * handled the error. So, re-enable II to send more interrupt
	 * and return.
	 */
	REMOTE_HUB_S(hinfo->h_nasid, IIO_IECLR, 0xffffff);
	idsr = REMOTE_HUB_L(hinfo->h_nasid, IIO_IIDSR) & ~IIO_IIDSR_SENT_MASK;
	REMOTE_HUB_S(hinfo->h_nasid, IIO_IIDSR, idsr);
#endif /* ajm */
}
476