arch/ia64/sn/kernel/bte_error.c at v4.19 · tjh.dev/kernel

tjh.dev / kernel
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
kernel / arch / ia64 / sn / kernel / bte_error.c
at v4.19 255 lines 7.5 kB view raw
  1/*
  2 * This file is subject to the terms and conditions of the GNU General Public
  3 * License.  See the file "COPYING" in the main directory of this archive
  4 * for more details.
  5 *
  6 * Copyright (c) 2000-2007 Silicon Graphics, Inc.  All Rights Reserved.
  7 */
  8
  9#include <linux/types.h>
 10#include <asm/sn/sn_sal.h>
 11#include "ioerror.h"
 12#include <asm/sn/addrs.h>
 13#include <asm/sn/shubio.h>
 14#include <asm/sn/geo.h>
 15#include "xtalk/xwidgetdev.h"
 16#include "xtalk/hubdev.h"
 17#include <asm/sn/bte.h>
 18#include <asm/param.h>
 19
 20/*
 21 * Bte error handling is done in two parts.  The first captures
 22 * any crb related errors.  Since there can be multiple crbs per
 23 * interface and multiple interfaces active, we need to wait until
 24 * all active crbs are completed.  This is the first job of the
 25 * second part error handler.  When all bte related CRBs are cleanly
 26 * completed, it resets the interfaces and gets them ready for new
 27 * transfers to be queued.
 28 */
 29
 30/*
 31 * Wait until all BTE related CRBs are completed
 32 * and then reset the interfaces.
 33 */
 34static int shub1_bte_error_handler(struct nodepda_s *err_nodepda)
 35{
 36	struct timer_list *recovery_timer = &err_nodepda->bte_recovery_timer;
 37	nasid_t nasid;
 38	int i;
 39	int valid_crbs;
 40	ii_imem_u_t imem;	/* II IMEM Register */
 41	ii_icrb0_d_u_t icrbd;	/* II CRB Register D */
 42	ii_ibcr_u_t ibcr;
 43	ii_icmr_u_t icmr;
 44	ii_ieclr_u_t ieclr;
 45
 46	BTE_PRINTK(("shub1_bte_error_handler(%p) - %d\n", err_nodepda,
 47		    smp_processor_id()));
 48
 49	if ((err_nodepda->bte_if[0].bh_error == BTE_SUCCESS) &&
 50	    (err_nodepda->bte_if[1].bh_error == BTE_SUCCESS)) {
 51		BTE_PRINTK(("eh:%p:%d Nothing to do.\n", err_nodepda,
 52			    smp_processor_id()));
 53		return 1;
 54	}
 55
 56	/* Determine information about our hub */
 57	nasid = cnodeid_to_nasid(err_nodepda->bte_if[0].bte_cnode);
 58
 59	/*
 60	 * A BTE transfer can use multiple CRBs.  We need to make sure
 61	 * that all the BTE CRBs are complete (or timed out) before
 62	 * attempting to clean up the error.  Resetting the BTE while
 63	 * there are still BTE CRBs active will hang the BTE.
 64	 * We should look at all the CRBs to see if they are allocated
 65	 * to the BTE and see if they are still active.  When none
 66	 * are active, we can continue with the cleanup.
 67	 *
 68	 * We also want to make sure that the local NI port is up.
 69	 * When a router resets the NI port can go down, while it
 70	 * goes through the LLP handshake, but then comes back up.
 71	 */
 72	icmr.ii_icmr_regval = REMOTE_HUB_L(nasid, IIO_ICMR);
 73	if (icmr.ii_icmr_fld_s.i_crb_mark != 0) {
 74		/*
 75		 * There are errors which still need to be cleaned up by
 76		 * hubiio_crb_error_handler
 77		 */
 78		mod_timer(recovery_timer, jiffies + (HZ * 5));
 79		BTE_PRINTK(("eh:%p:%d Marked Giving up\n", err_nodepda,
 80			    smp_processor_id()));
 81		return 1;
 82	}
 83	if (icmr.ii_icmr_fld_s.i_crb_vld != 0) {
 84
 85		valid_crbs = icmr.ii_icmr_fld_s.i_crb_vld;
 86
 87		for (i = 0; i < IIO_NUM_CRBS; i++) {
 88			if (!((1 << i) & valid_crbs)) {
 89				/* This crb was not marked as valid, ignore */
 90				continue;
 91			}
 92			icrbd.ii_icrb0_d_regval =
 93			    REMOTE_HUB_L(nasid, IIO_ICRB_D(i));
 94			if (icrbd.d_bteop) {
 95				mod_timer(recovery_timer, jiffies + (HZ * 5));
 96				BTE_PRINTK(("eh:%p:%d Valid %d, Giving up\n",
 97					    err_nodepda, smp_processor_id(),
 98					    i));
 99				return 1;
100			}
101		}
102	}
103
104	BTE_PRINTK(("eh:%p:%d Cleaning up\n", err_nodepda, smp_processor_id()));
105	/* Re-enable both bte interfaces */
106	imem.ii_imem_regval = REMOTE_HUB_L(nasid, IIO_IMEM);
107	imem.ii_imem_fld_s.i_b0_esd = imem.ii_imem_fld_s.i_b1_esd = 1;
108	REMOTE_HUB_S(nasid, IIO_IMEM, imem.ii_imem_regval);
109
110	/* Clear BTE0/1 error bits */
111	ieclr.ii_ieclr_regval = 0;
112	if (err_nodepda->bte_if[0].bh_error != BTE_SUCCESS)
113		ieclr.ii_ieclr_fld_s.i_e_bte_0 = 1;
114	if (err_nodepda->bte_if[1].bh_error != BTE_SUCCESS)
115		ieclr.ii_ieclr_fld_s.i_e_bte_1 = 1;
116	REMOTE_HUB_S(nasid, IIO_IECLR, ieclr.ii_ieclr_regval);
117
118	/* Reinitialize both BTE state machines. */
119	ibcr.ii_ibcr_regval = REMOTE_HUB_L(nasid, IIO_IBCR);
120	ibcr.ii_ibcr_fld_s.i_soft_reset = 1;
121	REMOTE_HUB_S(nasid, IIO_IBCR, ibcr.ii_ibcr_regval);
122
123	del_timer(recovery_timer);
124	return 0;
125}
126
127/*
128 * Wait until all BTE related CRBs are completed
129 * and then reset the interfaces.
130 */
131static int shub2_bte_error_handler(struct nodepda_s *err_nodepda)
132{
133	struct timer_list *recovery_timer = &err_nodepda->bte_recovery_timer;
134	struct bteinfo_s *bte;
135	nasid_t nasid;
136	u64 status;
137	int i;
138
139	nasid = cnodeid_to_nasid(err_nodepda->bte_if[0].bte_cnode);
140
141	/*
142	 * Verify that all the BTEs are complete
143	 */
144	for (i = 0; i < BTES_PER_NODE; i++) {
145		bte = &err_nodepda->bte_if[i];
146		status = BTE_LNSTAT_LOAD(bte);
147		if (status & IBLS_ERROR) {
148			bte->bh_error = BTE_SHUB2_ERROR(status);
149			continue;
150		}
151		if (!(status & IBLS_BUSY))
152			continue;
153		mod_timer(recovery_timer, jiffies + (HZ * 5));
154		BTE_PRINTK(("eh:%p:%d Marked Giving up\n", err_nodepda,
155			    smp_processor_id()));
156		return 1;
157	}
158	if (ia64_sn_bte_recovery(nasid))
159		panic("bte_error_handler(): Fatal BTE Error");
160
161	del_timer(recovery_timer);
162	return 0;
163}
164
165/*
166 * Wait until all BTE related CRBs are completed
167 * and then reset the interfaces.
168 */
169void bte_error_handler(struct nodepda_s *err_nodepda)
170{
171	spinlock_t *recovery_lock = &err_nodepda->bte_recovery_lock;
172	int i;
173	unsigned long irq_flags;
174	volatile u64 *notify;
175	bte_result_t bh_error;
176
177	BTE_PRINTK(("bte_error_handler(%p) - %d\n", err_nodepda,
178		    smp_processor_id()));
179
180	spin_lock_irqsave(recovery_lock, irq_flags);
181
182	/*
183	 * Lock all interfaces on this node to prevent new transfers
184	 * from being queued.
185	 */
186	for (i = 0; i < BTES_PER_NODE; i++) {
187		if (err_nodepda->bte_if[i].cleanup_active) {
188			continue;
189		}
190		spin_lock(&err_nodepda->bte_if[i].spinlock);
191		BTE_PRINTK(("eh:%p:%d locked %d\n", err_nodepda,
192			    smp_processor_id(), i));
193		err_nodepda->bte_if[i].cleanup_active = 1;
194	}
195
196	if (is_shub1()) {
197		if (shub1_bte_error_handler(err_nodepda)) {
198			spin_unlock_irqrestore(recovery_lock, irq_flags);
199			return;
200		}
201	} else {
202		if (shub2_bte_error_handler(err_nodepda)) {
203			spin_unlock_irqrestore(recovery_lock, irq_flags);
204			return;
205		}
206	}
207
208	for (i = 0; i < BTES_PER_NODE; i++) {
209		bh_error = err_nodepda->bte_if[i].bh_error;
210		if (bh_error != BTE_SUCCESS) {
211			/* There is an error which needs to be notified */
212			notify = err_nodepda->bte_if[i].most_rcnt_na;
213			BTE_PRINTK(("cnode %d bte %d error=0x%lx\n",
214				    err_nodepda->bte_if[i].bte_cnode,
215				    err_nodepda->bte_if[i].bte_num,
216				    IBLS_ERROR | (u64) bh_error));
217			*notify = IBLS_ERROR | bh_error;
218			err_nodepda->bte_if[i].bh_error = BTE_SUCCESS;
219		}
220
221		err_nodepda->bte_if[i].cleanup_active = 0;
222		BTE_PRINTK(("eh:%p:%d Unlocked %d\n", err_nodepda,
223			    smp_processor_id(), i));
224		spin_unlock(&err_nodepda->bte_if[i].spinlock);
225	}
226
227	spin_unlock_irqrestore(recovery_lock, irq_flags);
228}
229
230/*
231 * First part error handler.  This is called whenever any error CRB interrupt
232 * is generated by the II.
233 */
234void
235bte_crb_error_handler(cnodeid_t cnode, int btenum,
236                      int crbnum, ioerror_t * ioe, int bteop)
237{
238	struct bteinfo_s *bte;
239
240
241	bte = &(NODEPDA(cnode)->bte_if[btenum]);
242
243	/*
244	 * The caller has already figured out the error type, we save that
245	 * in the bte handle structure for the thread exercising the
246	 * interface to consume.
247	 */
248	bte->bh_error = ioe->ie_errortype + BTEFAIL_OFFSET;
249	bte->bte_error_count++;
250
251	BTE_PRINTK(("Got an error on cnode %d bte %d: HW error type 0x%x\n",
252		bte->bte_cnode, bte->bte_num, ioe->ie_errortype));
253	bte_error_handler(NODEPDA(cnode));
254}
255