x86, UV: Improve BAU performance and error recovery

- increase performance of the interrupt handler

- release timed-out software acknowledge resources

- recover from continuous-busy status due to a hardware issue

- add a 'throttle' to keep a uvhub from sending more than a
specified number of broadcasts concurrently, to work around the
hardware issue (see the condensed fragments after this list)

- provide a 'nobau' boot command line option (also sketched below)

- rename 'pnode' and 'node' to 'uvhub' (the 'node' terminology
is ambiguous)

- add some new statistics, readable through /proc/sgi_uv/ptc_statistics,
about the scope of broadcasts, retries, the hardware issue and the
'throttle'

- split off a new function uv_bau_process_retry_msg() from
uv_bau_process_message(), per community coding style feedback.

- simplify the argument list to uv_bau_process_message(), per
community coding style feedback.
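
For reference, the 'throttle' and the 'nobau' option mentioned above come
down to the fragments below, condensed from the patch itself (hmaster,
active_descriptor_count and max_concurrent are new fields of struct
bau_control; max_concurrent defaults to MAX_BAU_CONCURRENT, i.e. 3):

  /* conditionally add 1 to *v unless it is already >= u */
  static inline int atomic_inc_unless_ge(spinlock_t *lock, atomic_t *v, int u)
  {
          spin_lock(lock);
          if (atomic_read(v) >= u) {
                  spin_unlock(lock);
                  return 0;
          }
          atomic_inc(v);
          spin_unlock(lock);
          return 1;
  }

  /* in uv_flush_send_and_wait(): spin while this uvhub already has
   * max_concurrent broadcasts in flight */
  if (!atomic_inc_unless_ge(&hmaster->uvhub_lock,
                  &hmaster->active_descriptor_count,
                  hmaster->max_concurrent)) {
          stat->s_throttles++;
          do {
                  cpu_relax();
          } while (!atomic_inc_unless_ge(&hmaster->uvhub_lock,
                  &hmaster->active_descriptor_count,
                  hmaster->max_concurrent));
  }

  /* 'nobau' boot option; uv_flush_tlb_others() then returns the caller's
   * mask untouched, so the kernel falls back to IPI-style shootdowns */
  static int nobau;
  static int __init setup_nobau(char *arg)
  {
          nobau = 1;
          return 0;
  }
  early_param("nobau", setup_nobau);

The new counters are exported per cpu through /proc/sgi_uv/ptc_statistics;
writing 0 to that file prints a legend for the fields, -1 clears the
counters, and a positive value changes max_concurrent on every cpu.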

Signed-off-by: Cliff Wickman <cpw@sgi.com>
Cc: linux-mm@kvack.org
Cc: Jack Steiner <steiner@sgi.com>
Cc: Russ Anderson <rja@sgi.com>
Cc: Mike Travis <travis@sgi.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
LKML-Reference: <E1O25Z4-0004Ur-PB@eag09.americas.sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

authored by Cliff Wickman and committed by Ingo Molnar b8f7fb13 2acebe9e

+1084 -451
+174 -71
arch/x86/include/asm/uv/uv_bau.h
··· 27 27 * set 2 is at BASE + 2*512, set 3 at BASE + 3*512, and so on. 28 28 * 29 29 * We will use 31 sets, one for sending BAU messages from each of the 32 30 - * cpu's on the node. 30 + * cpu's on the uvhub. 31 31 * 32 32 * TLB shootdown will use the first of the 8 descriptors of each set. 33 33 * Each of the descriptors is 64 bytes in size (8*64 = 512 bytes in a set). 34 34 */ 35 35 36 36 #define UV_ITEMS_PER_DESCRIPTOR 8 37 + #define MAX_BAU_CONCURRENT 3 37 38 #define UV_CPUS_PER_ACT_STATUS 32 38 39 #define UV_ACT_STATUS_MASK 0x3 39 40 #define UV_ACT_STATUS_SIZE 2 ··· 46 45 #define UV_PAYLOADQ_PNODE_SHIFT 49 47 46 #define UV_PTC_BASENAME "sgi_uv/ptc_statistics" 48 47 #define uv_physnodeaddr(x) ((__pa((unsigned long)(x)) & uv_mmask)) 48 + #define UV_ENABLE_INTD_SOFT_ACK_MODE_SHIFT 15 49 + #define UV_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHIFT 16 50 + #define UV_INTD_SOFT_ACK_TIMEOUT_PERIOD 0x000000000bUL 49 51 50 52 /* 51 53 * bits in UVH_LB_BAU_SB_ACTIVATION_STATUS_0/1 ··· 59 55 #define DESC_STATUS_SOURCE_TIMEOUT 3 60 56 61 57 /* 62 - * source side thresholds at which message retries print a warning 58 + * source side threshholds at which message retries print a warning 63 59 */ 64 60 #define SOURCE_TIMEOUT_LIMIT 20 65 61 #define DESTINATION_TIMEOUT_LIMIT 20 66 62 67 63 /* 64 + * misc. delays, in microseconds 65 + */ 66 + #define THROTTLE_DELAY 10 67 + #define TIMEOUT_DELAY 10 68 + #define BIOS_TO 1000 69 + /* BIOS is assumed to set the destination timeout to 1003520 nanoseconds */ 70 + 71 + /* 72 + * threshholds at which to use IPI to free resources 73 + */ 74 + #define PLUGSB4RESET 100 75 + #define TIMEOUTSB4RESET 100 76 + 77 + /* 68 78 * number of entries in the destination side payload queue 69 79 */ 70 - #define DEST_Q_SIZE 17 80 + #define DEST_Q_SIZE 20 71 81 /* 72 82 * number of destination side software ack resources 73 83 */ ··· 90 72 /* 91 73 * completion statuses for sending a TLB flush message 92 74 */ 93 - #define FLUSH_RETRY 1 94 - #define FLUSH_GIVEUP 2 95 - #define FLUSH_COMPLETE 3 75 + #define FLUSH_RETRY_PLUGGED 1 76 + #define FLUSH_RETRY_TIMEOUT 2 77 + #define FLUSH_GIVEUP 3 78 + #define FLUSH_COMPLETE 4 96 79 97 80 /* 98 81 * Distribution: 32 bytes (256 bits) (bytes 0-0x1f of descriptor) ··· 105 86 * 'base_dest_nodeid' field of the header corresponds to the 106 87 * destination nodeID associated with that specified bit. 107 88 */ 108 - struct bau_target_nodemask { 109 - unsigned long bits[BITS_TO_LONGS(256)]; 89 + struct bau_target_uvhubmask { 90 + unsigned long bits[BITS_TO_LONGS(UV_DISTRIBUTION_SIZE)]; 110 91 }; 111 92 112 93 /* 113 - * mask of cpu's on a node 94 + * mask of cpu's on a uvhub 114 95 * (during initialization we need to check that unsigned long has 115 - * enough bits for max. cpu's per node) 96 + * enough bits for max. 
cpu's per uvhub) 116 97 */ 117 98 struct bau_local_cpumask { 118 99 unsigned long bits; ··· 154 135 struct bau_msg_header { 155 136 unsigned int dest_subnodeid:6; /* must be 0x10, for the LB */ 156 137 /* bits 5:0 */ 157 - unsigned int base_dest_nodeid:15; /* nasid>>1 (pnode) of */ 158 - /* bits 20:6 */ /* first bit in node_map */ 138 + unsigned int base_dest_nodeid:15; /* nasid (pnode<<1) of */ 139 + /* bits 20:6 */ /* first bit in uvhub map */ 159 140 unsigned int command:8; /* message type */ 160 141 /* bits 28:21 */ 161 142 /* 0x38: SN3net EndPoint Message */ ··· 165 146 unsigned int rsvd_2:9; /* must be zero */ 166 147 /* bits 40:32 */ 167 148 /* Suppl_A is 56-41 */ 168 - unsigned int payload_2a:8;/* becomes byte 16 of msg */ 169 - /* bits 48:41 */ /* not currently using */ 170 - unsigned int payload_2b:8;/* becomes byte 17 of msg */ 171 - /* bits 56:49 */ /* not currently using */ 149 + unsigned int sequence:16;/* message sequence number */ 150 + /* bits 56:41 */ /* becomes bytes 16-17 of msg */ 172 151 /* Address field (96:57) is never used as an 173 152 address (these are address bits 42:3) */ 153 + 174 154 unsigned int rsvd_3:1; /* must be zero */ 175 155 /* bit 57 */ 176 156 /* address bits 27:4 are payload */ 177 - /* these 24 bits become bytes 12-14 of msg */ 157 + /* these next 24 (58-81) bits become bytes 12-14 of msg */ 158 + 159 + /* bits 65:58 land in byte 12 */ 178 160 unsigned int replied_to:1;/* sent as 0 by the source to byte 12 */ 179 161 /* bit 58 */ 162 + unsigned int msg_type:3; /* software type of the message*/ 163 + /* bits 61:59 */ 164 + unsigned int canceled:1; /* message canceled, resource to be freed*/ 165 + /* bit 62 */ 166 + unsigned int payload_1a:1;/* not currently used */ 167 + /* bit 63 */ 168 + unsigned int payload_1b:2;/* not currently used */ 169 + /* bits 65:64 */ 180 170 181 - unsigned int payload_1a:5;/* not currently used */ 182 - /* bits 63:59 */ 183 - unsigned int payload_1b:8;/* not currently used */ 184 - /* bits 71:64 */ 185 - unsigned int payload_1c:8;/* not currently used */ 186 - /* bits 79:72 */ 187 - unsigned int payload_1d:2;/* not currently used */ 171 + /* bits 73:66 land in byte 13 */ 172 + unsigned int payload_1ca:6;/* not currently used */ 173 + /* bits 71:66 */ 174 + unsigned int payload_1c:2;/* not currently used */ 175 + /* bits 73:72 */ 176 + 177 + /* bits 81:74 land in byte 14 */ 178 + unsigned int payload_1d:6;/* not currently used */ 179 + /* bits 79:74 */ 180 + unsigned int payload_1e:2;/* not currently used */ 188 181 /* bits 81:80 */ 189 182 190 183 unsigned int rsvd_4:7; /* must be zero */ ··· 209 178 /* bits 95:90 */ 210 179 unsigned int rsvd_6:5; /* must be zero */ 211 180 /* bits 100:96 */ 212 - unsigned int int_both:1;/* if 1, interrupt both sockets on the blade */ 181 + unsigned int int_both:1;/* if 1, interrupt both sockets on the uvhub */ 213 182 /* bit 101*/ 214 183 unsigned int fairness:3;/* usually zero */ 215 184 /* bits 104:102 */ ··· 222 191 /* bits 127:107 */ 223 192 }; 224 193 194 + /* see msg_type: */ 195 + #define MSG_NOOP 0 196 + #define MSG_REGULAR 1 197 + #define MSG_RETRY 2 198 + 225 199 /* 226 200 * The activation descriptor: 227 201 * The format of the message to send, plus all accompanying control 228 202 * Should be 64 bytes 229 203 */ 230 204 struct bau_desc { 231 - struct bau_target_nodemask distribution; 205 + struct bau_target_uvhubmask distribution; 232 206 /* 233 207 * message template, consisting of header and payload: 234 208 */ ··· 273 237 unsigned short acknowledge_count; /* filled in 
by destination */ 274 238 /* 16 bits, bytes 10-11 */ 275 239 276 - unsigned short replied_to:1; /* sent as 0 by the source */ 277 - /* 1 bit */ 278 - unsigned short unused1:7; /* not currently using */ 279 - /* 7 bits: byte 12) */ 240 + /* these next 3 bytes come from bits 58-81 of the message header */ 241 + unsigned short replied_to:1; /* sent as 0 by the source */ 242 + unsigned short msg_type:3; /* software message type */ 243 + unsigned short canceled:1; /* sent as 0 by the source */ 244 + unsigned short unused1:3; /* not currently using */ 245 + /* byte 12 */ 280 246 281 - unsigned char unused2[2]; /* not currently using */ 282 - /* bytes 13-14 */ 247 + unsigned char unused2a; /* not currently using */ 248 + /* byte 13 */ 249 + unsigned char unused2; /* not currently using */ 250 + /* byte 14 */ 283 251 284 252 unsigned char sw_ack_vector; /* filled in by the hardware */ 285 253 /* byte 15 (bits 127:120) */ 286 254 287 - unsigned char unused4[3]; /* not currently using bytes 17-19 */ 288 - /* bytes 17-19 */ 255 + unsigned short sequence; /* message sequence number */ 256 + /* bytes 16-17 */ 257 + unsigned char unused4[2]; /* not currently using bytes 18-19 */ 258 + /* bytes 18-19 */ 289 259 290 260 int number_of_cpus; /* filled in at destination */ 291 261 /* 32 bits, bytes 20-23 (aligned) */ ··· 301 259 }; 302 260 303 261 /* 304 - * one for every slot in the destination payload queue 305 - */ 306 - struct bau_msg_status { 307 - struct bau_local_cpumask seen_by; /* map of cpu's */ 308 - }; 309 - 310 - /* 311 - * one for every slot in the destination software ack resources 312 - */ 313 - struct bau_sw_ack_status { 314 - struct bau_payload_queue_entry *msg; /* associated message */ 315 - int watcher; /* cpu monitoring, or -1 */ 316 - }; 317 - 318 - /* 319 - * one on every node and per-cpu; to locate the software tables 262 + * one per-cpu; to locate the software tables 320 263 */ 321 264 struct bau_control { 322 265 struct bau_desc *descriptor_base; 323 - struct bau_payload_queue_entry *bau_msg_head; 324 266 struct bau_payload_queue_entry *va_queue_first; 325 267 struct bau_payload_queue_entry *va_queue_last; 326 - struct bau_msg_status *msg_statuses; 327 - int *watching; /* pointer to array */ 268 + struct bau_payload_queue_entry *bau_msg_head; 269 + struct bau_control *uvhub_master; 270 + struct bau_control *socket_master; 271 + unsigned long timeout_interval; 272 + atomic_t active_descriptor_count; 273 + int max_concurrent; 274 + int max_concurrent_constant; 275 + int retry_message_scans; 276 + int plugged_tries; 277 + int timeout_tries; 278 + int ipi_attempts; 279 + int conseccompletes; 280 + short cpu; 281 + short uvhub_cpu; 282 + short uvhub; 283 + short cpus_in_socket; 284 + short cpus_in_uvhub; 285 + unsigned short message_number; 286 + unsigned short uvhub_quiesce; 287 + short socket_acknowledge_count[DEST_Q_SIZE]; 288 + cycles_t send_message; 289 + spinlock_t masks_lock; 290 + spinlock_t uvhub_lock; 291 + spinlock_t queue_lock; 328 292 }; 329 293 330 294 /* 331 295 * This structure is allocated per_cpu for UV TLB shootdown statistics. 
332 296 */ 333 297 struct ptc_stats { 334 - unsigned long ptc_i; /* number of IPI-style flushes */ 335 - unsigned long requestor; /* number of nodes this cpu sent to */ 336 - unsigned long requestee; /* times cpu was remotely requested */ 337 - unsigned long alltlb; /* times all tlb's on this cpu were flushed */ 338 - unsigned long onetlb; /* times just one tlb on this cpu was flushed */ 339 - unsigned long s_retry; /* retries on source side timeouts */ 340 - unsigned long d_retry; /* retries on destination side timeouts */ 341 - unsigned long sflush; /* cycles spent in uv_flush_tlb_others */ 342 - unsigned long dflush; /* cycles spent on destination side */ 343 - unsigned long retriesok; /* successes on retries */ 344 - unsigned long nomsg; /* interrupts with no message */ 345 - unsigned long multmsg; /* interrupts with multiple messages */ 346 - unsigned long ntargeted;/* nodes targeted */ 298 + /* sender statistics */ 299 + unsigned long s_giveup; /* number of fall backs to IPI-style flushes */ 300 + unsigned long s_requestor; /* number of shootdown requests */ 301 + unsigned long s_stimeout; /* source side timeouts */ 302 + unsigned long s_dtimeout; /* destination side timeouts */ 303 + unsigned long s_time; /* time spent in sending side */ 304 + unsigned long s_retriesok; /* successful retries */ 305 + unsigned long s_ntargcpu; /* number of cpus targeted */ 306 + unsigned long s_ntarguvhub; /* number of uvhubs targeted */ 307 + unsigned long s_ntarguvhub16; /* number of times >= 16 target hubs */ 308 + unsigned long s_ntarguvhub8; /* number of times >= 8 target hubs */ 309 + unsigned long s_ntarguvhub4; /* number of times >= 4 target hubs */ 310 + unsigned long s_ntarguvhub2; /* number of times >= 2 target hubs */ 311 + unsigned long s_ntarguvhub1; /* number of times == 1 target hub */ 312 + unsigned long s_resets_plug; /* ipi-style resets from plug state */ 313 + unsigned long s_resets_timeout; /* ipi-style resets from timeouts */ 314 + unsigned long s_busy; /* status stayed busy past s/w timer */ 315 + unsigned long s_throttles; /* waits in throttle */ 316 + unsigned long s_retry_messages; /* retry broadcasts */ 317 + /* destination statistics */ 318 + unsigned long d_alltlb; /* times all tlb's on this cpu were flushed */ 319 + unsigned long d_onetlb; /* times just one tlb on this cpu was flushed */ 320 + unsigned long d_multmsg; /* interrupts with multiple messages */ 321 + unsigned long d_nomsg; /* interrupts with no message */ 322 + unsigned long d_time; /* time spent on destination side */ 323 + unsigned long d_requestee; /* number of messages processed */ 324 + unsigned long d_retries; /* number of retry messages processed */ 325 + unsigned long d_canceled; /* number of messages canceled by retries */ 326 + unsigned long d_nocanceled; /* retries that found nothing to cancel */ 327 + unsigned long d_resets; /* number of ipi-style requests processed */ 328 + unsigned long d_rcanceled; /* number of messages canceled by resets */ 347 329 }; 348 330 349 - static inline int bau_node_isset(int node, struct bau_target_nodemask *dstp) 331 + static inline int bau_uvhub_isset(int uvhub, struct bau_target_uvhubmask *dstp) 350 332 { 351 - return constant_test_bit(node, &dstp->bits[0]); 333 + return constant_test_bit(uvhub, &dstp->bits[0]); 352 334 } 353 - static inline void bau_node_set(int node, struct bau_target_nodemask *dstp) 335 + static inline void bau_uvhub_set(int uvhub, struct bau_target_uvhubmask *dstp) 354 336 { 355 - __set_bit(node, &dstp->bits[0]); 337 + __set_bit(uvhub, 
&dstp->bits[0]); 356 338 } 357 - static inline void bau_nodes_clear(struct bau_target_nodemask *dstp, int nbits) 339 + static inline void bau_uvhubs_clear(struct bau_target_uvhubmask *dstp, 340 + int nbits) 358 341 { 359 342 bitmap_zero(&dstp->bits[0], nbits); 343 + } 344 + static inline int bau_uvhub_weight(struct bau_target_uvhubmask *dstp) 345 + { 346 + return bitmap_weight((unsigned long *)&dstp->bits[0], 347 + UV_DISTRIBUTION_SIZE); 360 348 } 361 349 362 350 static inline void bau_cpubits_clear(struct bau_local_cpumask *dstp, int nbits) ··· 399 327 400 328 extern void uv_bau_message_intr1(void); 401 329 extern void uv_bau_timeout_intr1(void); 330 + 331 + struct atomic_short { 332 + short counter; 333 + }; 334 + 335 + /** 336 + * atomic_read_short - read a short atomic variable 337 + * @v: pointer of type atomic_short 338 + * 339 + * Atomically reads the value of @v. 340 + */ 341 + static inline int atomic_read_short(const struct atomic_short *v) 342 + { 343 + return v->counter; 344 + } 345 + 346 + /** 347 + * atomic_add_short_return - add and return a short int 348 + * @i: short value to add 349 + * @v: pointer of type atomic_short 350 + * 351 + * Atomically adds @i to @v and returns @i + @v 352 + */ 353 + static inline int atomic_add_short_return(short i, struct atomic_short *v) 354 + { 355 + short __i = i; 356 + asm volatile(LOCK_PREFIX "xaddw %0, %1" 357 + : "+r" (i), "+m" (v->counter) 358 + : : "memory"); 359 + return i + __i; 360 + } 402 361 403 362 #endif /* _ASM_X86_UV_UV_BAU_H */
+910 -380
arch/x86/kernel/tlb_uv.c
··· 1 1 /* 2 2 * SGI UltraViolet TLB flush routines. 3 3 * 4 - * (c) 2008 Cliff Wickman <cpw@sgi.com>, SGI. 4 + * (c) 2008-2010 Cliff Wickman <cpw@sgi.com>, SGI. 5 5 * 6 6 * This code is released under the GNU General Public License version 2 or 7 7 * later. ··· 19 19 #include <asm/idle.h> 20 20 #include <asm/tsc.h> 21 21 #include <asm/irq_vectors.h> 22 + #include <asm/timer.h> 23 + 24 + struct msg_desc { 25 + struct bau_payload_queue_entry *msg; 26 + int msg_slot; 27 + int sw_ack_slot; 28 + struct bau_payload_queue_entry *va_queue_first; 29 + struct bau_payload_queue_entry *va_queue_last; 30 + }; 22 31 23 32 #define UV_INTD_SOFT_ACK_TIMEOUT_PERIOD 0x000000000bUL 24 33 25 - static struct bau_control **uv_bau_table_bases __read_mostly; 26 - static int uv_bau_retry_limit __read_mostly; 34 + static int uv_bau_max_concurrent __read_mostly; 35 + 36 + static int nobau; 37 + static int __init setup_nobau(char *arg) 38 + { 39 + nobau = 1; 40 + return 0; 41 + } 42 + early_param("nobau", setup_nobau); 27 43 28 44 /* base pnode in this partition */ 29 - static int uv_partition_base_pnode __read_mostly; 30 - 31 - static unsigned long uv_mmask __read_mostly; 45 + static int uv_partition_base_pnode __read_mostly; 46 + /* position of pnode (which is nasid>>1): */ 47 + static int uv_nshift __read_mostly; 48 + static unsigned long uv_mmask __read_mostly; 32 49 33 50 static DEFINE_PER_CPU(struct ptc_stats, ptcstats); 34 51 static DEFINE_PER_CPU(struct bau_control, bau_control); 52 + static DEFINE_PER_CPU(cpumask_var_t, uv_flush_tlb_mask); 53 + 54 + struct reset_args { 55 + int sender; 56 + }; 35 57 36 58 /* 37 - * Determine the first node on a blade. 59 + * Determine the first node on a uvhub. 'Nodes' are used for kernel 60 + * memory allocation. 38 61 */ 39 - static int __init blade_to_first_node(int blade) 62 + static int __init uvhub_to_first_node(int uvhub) 40 63 { 41 64 int node, b; 42 65 43 66 for_each_online_node(node) { 44 67 b = uv_node_to_blade_id(node); 45 - if (blade == b) 68 + if (uvhub == b) 46 69 return node; 47 70 } 48 - return -1; /* shouldn't happen */ 71 + return -1; 49 72 } 50 73 51 74 /* 52 - * Determine the apicid of the first cpu on a blade. 75 + * Determine the apicid of the first cpu on a uvhub. 53 76 */ 54 - static int __init blade_to_first_apicid(int blade) 77 + static int __init uvhub_to_first_apicid(int uvhub) 55 78 { 56 79 int cpu; 57 80 58 81 for_each_present_cpu(cpu) 59 - if (blade == uv_cpu_to_blade_id(cpu)) 82 + if (uvhub == uv_cpu_to_blade_id(cpu)) 60 83 return per_cpu(x86_cpu_to_apicid, cpu); 61 84 return -1; 62 85 } ··· 92 69 * clear of the Timeout bit (as well) will free the resource. No reply will 93 70 * be sent (the hardware will only do one reply per message). 
94 71 */ 95 - static void uv_reply_to_message(int resource, 96 - struct bau_payload_queue_entry *msg, 97 - struct bau_msg_status *msp) 72 + static inline void uv_reply_to_message(struct msg_desc *mdp, 73 + struct bau_control *bcp) 98 74 { 99 75 unsigned long dw; 76 + struct bau_payload_queue_entry *msg; 100 77 101 - dw = (1 << (resource + UV_SW_ACK_NPENDING)) | (1 << resource); 78 + msg = mdp->msg; 79 + if (!msg->canceled) { 80 + dw = (msg->sw_ack_vector << UV_SW_ACK_NPENDING) | 81 + msg->sw_ack_vector; 82 + uv_write_local_mmr( 83 + UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, dw); 84 + } 102 85 msg->replied_to = 1; 103 86 msg->sw_ack_vector = 0; 104 - if (msp) 105 - msp->seen_by.bits = 0; 106 - uv_write_local_mmr(UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, dw); 87 + } 88 + 89 + /* 90 + * Process the receipt of a RETRY message 91 + */ 92 + static inline void uv_bau_process_retry_msg(struct msg_desc *mdp, 93 + struct bau_control *bcp) 94 + { 95 + int i; 96 + int cancel_count = 0; 97 + int slot2; 98 + unsigned long msg_res; 99 + unsigned long mmr = 0; 100 + struct bau_payload_queue_entry *msg; 101 + struct bau_payload_queue_entry *msg2; 102 + struct ptc_stats *stat; 103 + 104 + msg = mdp->msg; 105 + stat = &per_cpu(ptcstats, bcp->cpu); 106 + stat->d_retries++; 107 + /* 108 + * cancel any message from msg+1 to the retry itself 109 + */ 110 + for (msg2 = msg+1, i = 0; i < DEST_Q_SIZE; msg2++, i++) { 111 + if (msg2 > mdp->va_queue_last) 112 + msg2 = mdp->va_queue_first; 113 + if (msg2 == msg) 114 + break; 115 + 116 + /* same conditions for cancellation as uv_do_reset */ 117 + if ((msg2->replied_to == 0) && (msg2->canceled == 0) && 118 + (msg2->sw_ack_vector) && ((msg2->sw_ack_vector & 119 + msg->sw_ack_vector) == 0) && 120 + (msg2->sending_cpu == msg->sending_cpu) && 121 + (msg2->msg_type != MSG_NOOP)) { 122 + slot2 = msg2 - mdp->va_queue_first; 123 + mmr = uv_read_local_mmr 124 + (UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE); 125 + msg_res = ((msg2->sw_ack_vector << 8) | 126 + msg2->sw_ack_vector); 127 + /* 128 + * This is a message retry; clear the resources held 129 + * by the previous message only if they timed out. 130 + * If it has not timed out we have an unexpected 131 + * situation to report. 132 + */ 133 + if (mmr & (msg_res << 8)) { 134 + /* 135 + * is the resource timed out? 136 + * make everyone ignore the cancelled message. 137 + */ 138 + msg2->canceled = 1; 139 + stat->d_canceled++; 140 + cancel_count++; 141 + uv_write_local_mmr( 142 + UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, 143 + (msg_res << 8) | msg_res); 144 + } else 145 + printk(KERN_INFO "note bau retry: no effect\n"); 146 + } 147 + } 148 + if (!cancel_count) 149 + stat->d_nocanceled++; 107 150 } 108 151 109 152 /* 110 153 * Do all the things a cpu should do for a TLB shootdown message. 111 154 * Other cpu's may come here at the same time for this message. 
112 155 */ 113 - static void uv_bau_process_message(struct bau_payload_queue_entry *msg, 114 - int msg_slot, int sw_ack_slot) 156 + static void uv_bau_process_message(struct msg_desc *mdp, 157 + struct bau_control *bcp) 115 158 { 116 - unsigned long this_cpu_mask; 117 - struct bau_msg_status *msp; 118 - int cpu; 159 + int msg_ack_count; 160 + short socket_ack_count = 0; 161 + struct ptc_stats *stat; 162 + struct bau_payload_queue_entry *msg; 163 + struct bau_control *smaster = bcp->socket_master; 119 164 120 - msp = __get_cpu_var(bau_control).msg_statuses + msg_slot; 121 - cpu = uv_blade_processor_id(); 122 - msg->number_of_cpus = 123 - uv_blade_nr_online_cpus(uv_node_to_blade_id(numa_node_id())); 124 - this_cpu_mask = 1UL << cpu; 125 - if (msp->seen_by.bits & this_cpu_mask) 126 - return; 127 - atomic_or_long(&msp->seen_by.bits, this_cpu_mask); 128 - 129 - if (msg->replied_to == 1) 130 - return; 131 - 165 + /* 166 + * This must be a normal message, or retry of a normal message 167 + */ 168 + msg = mdp->msg; 169 + stat = &per_cpu(ptcstats, bcp->cpu); 132 170 if (msg->address == TLB_FLUSH_ALL) { 133 171 local_flush_tlb(); 134 - __get_cpu_var(ptcstats).alltlb++; 172 + stat->d_alltlb++; 135 173 } else { 136 174 __flush_tlb_one(msg->address); 137 - __get_cpu_var(ptcstats).onetlb++; 175 + stat->d_onetlb++; 138 176 } 177 + stat->d_requestee++; 139 178 140 - __get_cpu_var(ptcstats).requestee++; 179 + /* 180 + * One cpu on each uvhub has the additional job on a RETRY 181 + * of releasing the resource held by the message that is 182 + * being retried. That message is identified by sending 183 + * cpu number. 184 + */ 185 + if (msg->msg_type == MSG_RETRY && bcp == bcp->uvhub_master) 186 + uv_bau_process_retry_msg(mdp, bcp); 141 187 142 - atomic_inc_short(&msg->acknowledge_count); 143 - if (msg->number_of_cpus == msg->acknowledge_count) 144 - uv_reply_to_message(sw_ack_slot, msg, msp); 145 - } 188 + /* 189 + * This is a sw_ack message, so we have to reply to it. 190 + * Count each responding cpu on the socket. This avoids 191 + * pinging the count's cache line back and forth between 192 + * the sockets. 193 + */ 194 + socket_ack_count = atomic_add_short_return(1, (struct atomic_short *) 195 + &smaster->socket_acknowledge_count[mdp->msg_slot]); 196 + if (socket_ack_count == bcp->cpus_in_socket) { 197 + /* 198 + * Both sockets dump their completed count total into 199 + * the message's count. 200 + */ 201 + smaster->socket_acknowledge_count[mdp->msg_slot] = 0; 202 + msg_ack_count = atomic_add_short_return(socket_ack_count, 203 + (struct atomic_short *)&msg->acknowledge_count); 146 204 147 - /* 148 - * Examine the payload queue on one distribution node to see 149 - * which messages have not been seen, and which cpu(s) have not seen them. 150 - * 151 - * Returns the number of cpu's that have not responded. 
152 - */ 153 - static int uv_examine_destination(struct bau_control *bau_tablesp, int sender) 154 - { 155 - struct bau_payload_queue_entry *msg; 156 - struct bau_msg_status *msp; 157 - int count = 0; 158 - int i; 159 - int j; 160 - 161 - for (msg = bau_tablesp->va_queue_first, i = 0; i < DEST_Q_SIZE; 162 - msg++, i++) { 163 - if ((msg->sending_cpu == sender) && (!msg->replied_to)) { 164 - msp = bau_tablesp->msg_statuses + i; 165 - printk(KERN_DEBUG 166 - "blade %d: address:%#lx %d of %d, not cpu(s): ", 167 - i, msg->address, msg->acknowledge_count, 168 - msg->number_of_cpus); 169 - for (j = 0; j < msg->number_of_cpus; j++) { 170 - if (!((1L << j) & msp->seen_by.bits)) { 171 - count++; 172 - printk("%d ", j); 173 - } 174 - } 175 - printk("\n"); 205 + if (msg_ack_count == bcp->cpus_in_uvhub) { 206 + /* 207 + * All cpus in uvhub saw it; reply 208 + */ 209 + uv_reply_to_message(mdp, bcp); 176 210 } 177 211 } 178 - return count; 212 + 213 + return; 179 214 } 180 215 181 216 /* 182 - * Examine the payload queue on all the distribution nodes to see 183 - * which messages have not been seen, and which cpu(s) have not seen them. 184 - * 185 - * Returns the number of cpu's that have not responded. 217 + * Determine the first cpu on a uvhub. 186 218 */ 187 - static int uv_examine_destinations(struct bau_target_nodemask *distribution) 219 + static int uvhub_to_first_cpu(int uvhub) 188 220 { 189 - int sender; 190 - int i; 191 - int count = 0; 192 - 193 - sender = smp_processor_id(); 194 - for (i = 0; i < sizeof(struct bau_target_nodemask) * BITSPERBYTE; i++) { 195 - if (!bau_node_isset(i, distribution)) 196 - continue; 197 - count += uv_examine_destination(uv_bau_table_bases[i], sender); 198 - } 199 - return count; 221 + int cpu; 222 + for_each_present_cpu(cpu) 223 + if (uvhub == uv_cpu_to_blade_id(cpu)) 224 + return cpu; 225 + return -1; 200 226 } 201 227 202 228 /* 203 - * wait for completion of a broadcast message 229 + * Last resort when we get a large number of destination timeouts is 230 + * to clear resources held by a given cpu. 231 + * Do this with IPI so that all messages in the BAU message queue 232 + * can be identified by their nonzero sw_ack_vector field. 204 233 * 205 - * return COMPLETE, RETRY or GIVEUP 234 + * This is entered for a single cpu on the uvhub. 235 + * The sender want's this uvhub to free a specific message's 236 + * sw_ack resources. 237 + */ 238 + static void 239 + uv_do_reset(void *ptr) 240 + { 241 + int i; 242 + int slot; 243 + int count = 0; 244 + unsigned long mmr; 245 + unsigned long msg_res; 246 + struct bau_control *bcp; 247 + struct reset_args *rap; 248 + struct bau_payload_queue_entry *msg; 249 + struct ptc_stats *stat; 250 + 251 + bcp = &per_cpu(bau_control, smp_processor_id()); 252 + rap = (struct reset_args *)ptr; 253 + stat = &per_cpu(ptcstats, bcp->cpu); 254 + stat->d_resets++; 255 + 256 + /* 257 + * We're looking for the given sender, and 258 + * will free its sw_ack resource. 259 + * If all cpu's finally responded after the timeout, its 260 + * message 'replied_to' was set. 
261 + */ 262 + for (msg = bcp->va_queue_first, i = 0; i < DEST_Q_SIZE; msg++, i++) { 263 + /* uv_do_reset: same conditions for cancellation as 264 + uv_bau_process_retry_msg() */ 265 + if ((msg->replied_to == 0) && 266 + (msg->canceled == 0) && 267 + (msg->sending_cpu == rap->sender) && 268 + (msg->sw_ack_vector) && 269 + (msg->msg_type != MSG_NOOP)) { 270 + /* 271 + * make everyone else ignore this message 272 + */ 273 + msg->canceled = 1; 274 + slot = msg - bcp->va_queue_first; 275 + count++; 276 + /* 277 + * only reset the resource if it is still pending 278 + */ 279 + mmr = uv_read_local_mmr 280 + (UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE); 281 + msg_res = ((msg->sw_ack_vector << 8) | 282 + msg->sw_ack_vector); 283 + if (mmr & msg_res) { 284 + stat->d_rcanceled++; 285 + uv_write_local_mmr( 286 + UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, 287 + msg_res); 288 + } 289 + } 290 + } 291 + return; 292 + } 293 + 294 + /* 295 + * Use IPI to get all target uvhubs to release resources held by 296 + * a given sending cpu number. 297 + */ 298 + static void uv_reset_with_ipi(struct bau_target_uvhubmask *distribution, 299 + int sender) 300 + { 301 + int uvhub; 302 + int cpu; 303 + cpumask_t mask; 304 + struct reset_args reset_args; 305 + 306 + reset_args.sender = sender; 307 + 308 + cpus_clear(mask); 309 + /* find a single cpu for each uvhub in this distribution mask */ 310 + for (uvhub = 0; 311 + uvhub < sizeof(struct bau_target_uvhubmask) * BITSPERBYTE; 312 + uvhub++) { 313 + if (!bau_uvhub_isset(uvhub, distribution)) 314 + continue; 315 + /* find a cpu for this uvhub */ 316 + cpu = uvhub_to_first_cpu(uvhub); 317 + cpu_set(cpu, mask); 318 + } 319 + /* IPI all cpus; Preemption is already disabled */ 320 + smp_call_function_many(&mask, uv_do_reset, (void *)&reset_args, 1); 321 + return; 322 + } 323 + 324 + static inline unsigned long 325 + cycles_2_us(unsigned long long cyc) 326 + { 327 + unsigned long long ns; 328 + unsigned long us; 329 + ns = (cyc * per_cpu(cyc2ns, smp_processor_id())) 330 + >> CYC2NS_SCALE_FACTOR; 331 + us = ns / 1000; 332 + return us; 333 + } 334 + 335 + /* 336 + * wait for all cpus on this hub to finish their sends and go quiet 337 + * leaves uvhub_quiesce set so that no new broadcasts are started by 338 + * bau_flush_send_and_wait() 339 + */ 340 + static inline void 341 + quiesce_local_uvhub(struct bau_control *hmaster) 342 + { 343 + atomic_add_short_return(1, (struct atomic_short *) 344 + &hmaster->uvhub_quiesce); 345 + } 346 + 347 + /* 348 + * mark this quiet-requestor as done 349 + */ 350 + static inline void 351 + end_uvhub_quiesce(struct bau_control *hmaster) 352 + { 353 + atomic_add_short_return(-1, (struct atomic_short *) 354 + &hmaster->uvhub_quiesce); 355 + } 356 + 357 + /* 358 + * Wait for completion of a broadcast software ack message 359 + * return COMPLETE, RETRY(PLUGGED or TIMEOUT) or GIVEUP 206 360 */ 207 361 static int uv_wait_completion(struct bau_desc *bau_desc, 208 - unsigned long mmr_offset, int right_shift) 362 + unsigned long mmr_offset, int right_shift, int this_cpu, 363 + struct bau_control *bcp, struct bau_control *smaster, long try) 209 364 { 210 - int exams = 0; 211 - long destination_timeouts = 0; 212 - long source_timeouts = 0; 365 + int relaxes = 0; 213 366 unsigned long descriptor_status; 367 + unsigned long mmr; 368 + unsigned long mask; 369 + cycles_t ttime; 370 + cycles_t timeout_time; 371 + struct ptc_stats *stat = &per_cpu(ptcstats, this_cpu); 372 + struct bau_control *hmaster; 214 373 374 + hmaster = bcp->uvhub_master; 375 + timeout_time = 
get_cycles() + bcp->timeout_interval; 376 + 377 + /* spin on the status MMR, waiting for it to go idle */ 215 378 while ((descriptor_status = (((unsigned long) 216 379 uv_read_local_mmr(mmr_offset) >> 217 380 right_shift) & UV_ACT_STATUS_MASK)) != 218 381 DESC_STATUS_IDLE) { 219 - if (descriptor_status == DESC_STATUS_SOURCE_TIMEOUT) { 220 - source_timeouts++; 221 - if (source_timeouts > SOURCE_TIMEOUT_LIMIT) 222 - source_timeouts = 0; 223 - __get_cpu_var(ptcstats).s_retry++; 224 - return FLUSH_RETRY; 225 - } 226 382 /* 227 - * spin here looking for progress at the destinations 383 + * Our software ack messages may be blocked because there are 384 + * no swack resources available. As long as none of them 385 + * has timed out hardware will NACK our message and its 386 + * state will stay IDLE. 228 387 */ 229 - if (descriptor_status == DESC_STATUS_DESTINATION_TIMEOUT) { 230 - destination_timeouts++; 231 - if (destination_timeouts > DESTINATION_TIMEOUT_LIMIT) { 232 - /* 233 - * returns number of cpus not responding 234 - */ 235 - if (uv_examine_destinations 236 - (&bau_desc->distribution) == 0) { 237 - __get_cpu_var(ptcstats).d_retry++; 238 - return FLUSH_RETRY; 239 - } 240 - exams++; 241 - if (exams >= uv_bau_retry_limit) { 242 - printk(KERN_DEBUG 243 - "uv_flush_tlb_others"); 244 - printk("giving up on cpu %d\n", 245 - smp_processor_id()); 388 + if (descriptor_status == DESC_STATUS_SOURCE_TIMEOUT) { 389 + stat->s_stimeout++; 390 + return FLUSH_GIVEUP; 391 + } else if (descriptor_status == 392 + DESC_STATUS_DESTINATION_TIMEOUT) { 393 + stat->s_dtimeout++; 394 + ttime = get_cycles(); 395 + 396 + /* 397 + * Our retries may be blocked by all destination 398 + * swack resources being consumed, and a timeout 399 + * pending. In that case hardware returns the 400 + * ERROR that looks like a destination timeout. 401 + */ 402 + if (cycles_2_us(ttime - bcp->send_message) < BIOS_TO) { 403 + bcp->conseccompletes = 0; 404 + return FLUSH_RETRY_PLUGGED; 405 + } 406 + 407 + bcp->conseccompletes = 0; 408 + return FLUSH_RETRY_TIMEOUT; 409 + } else { 410 + /* 411 + * descriptor_status is still BUSY 412 + */ 413 + cpu_relax(); 414 + relaxes++; 415 + if (relaxes >= 10000) { 416 + relaxes = 0; 417 + if (get_cycles() > timeout_time) { 418 + quiesce_local_uvhub(hmaster); 419 + 420 + /* single-thread the register change */ 421 + spin_lock(&hmaster->masks_lock); 422 + mmr = uv_read_local_mmr(mmr_offset); 423 + mask = 0UL; 424 + mask |= (3UL < right_shift); 425 + mask = ~mask; 426 + mmr &= mask; 427 + uv_write_local_mmr(mmr_offset, mmr); 428 + spin_unlock(&hmaster->masks_lock); 429 + end_uvhub_quiesce(hmaster); 430 + stat->s_busy++; 246 431 return FLUSH_GIVEUP; 247 432 } 248 - /* 249 - * delays can hang the simulator 250 - udelay(1000); 251 - */ 252 - destination_timeouts = 0; 253 433 } 254 434 } 255 - cpu_relax(); 256 435 } 436 + bcp->conseccompletes++; 257 437 return FLUSH_COMPLETE; 438 + } 439 + 440 + static inline cycles_t 441 + sec_2_cycles(unsigned long sec) 442 + { 443 + unsigned long ns; 444 + cycles_t cyc; 445 + 446 + ns = sec * 1000000000; 447 + cyc = (ns << CYC2NS_SCALE_FACTOR)/(per_cpu(cyc2ns, smp_processor_id())); 448 + return cyc; 449 + } 450 + 451 + /* 452 + * conditionally add 1 to *v, unless *v is >= u 453 + * return 0 if we cannot add 1 to *v because it is >= u 454 + * return 1 if we can add 1 to *v because it is < u 455 + * the add is atomic 456 + * 457 + * This is close to atomic_add_unless(), but this allows the 'u' value 458 + * to be lowered below the current 'v'. 
atomic_add_unless can only stop 459 + * on equal. 460 + */ 461 + static inline int atomic_inc_unless_ge(spinlock_t *lock, atomic_t *v, int u) 462 + { 463 + spin_lock(lock); 464 + if (atomic_read(v) >= u) { 465 + spin_unlock(lock); 466 + return 0; 467 + } 468 + atomic_inc(v); 469 + spin_unlock(lock); 470 + return 1; 258 471 } 259 472 260 473 /** 261 474 * uv_flush_send_and_wait 262 475 * 263 - * Send a broadcast and wait for a broadcast message to complete. 476 + * Send a broadcast and wait for it to complete. 264 477 * 265 - * The flush_mask contains the cpus the broadcast was sent to. 478 + * The flush_mask contains the cpus the broadcast is to be sent to, plus 479 + * cpus that are on the local uvhub. 266 480 * 267 - * Returns NULL if all remote flushing was done. The mask is zeroed. 481 + * Returns NULL if all flushing represented in the mask was done. The mask 482 + * is zeroed. 268 483 * Returns @flush_mask if some remote flushing remains to be done. The 269 - * mask will have some bits still set. 484 + * mask will have some bits still set, representing any cpus on the local 485 + * uvhub (not current cpu) and any on remote uvhubs if the broadcast failed. 270 486 */ 271 - const struct cpumask *uv_flush_send_and_wait(int cpu, int this_pnode, 272 - struct bau_desc *bau_desc, 273 - struct cpumask *flush_mask) 487 + const struct cpumask *uv_flush_send_and_wait(struct bau_desc *bau_desc, 488 + struct cpumask *flush_mask, 489 + struct bau_control *bcp) 274 490 { 275 - int completion_status = 0; 276 491 int right_shift; 277 - int tries = 0; 278 - int pnode; 492 + int uvhub; 279 493 int bit; 494 + int completion_status = 0; 495 + int seq_number = 0; 496 + long try = 0; 497 + int cpu = bcp->uvhub_cpu; 498 + int this_cpu = bcp->cpu; 499 + int this_uvhub = bcp->uvhub; 280 500 unsigned long mmr_offset; 281 501 unsigned long index; 282 502 cycles_t time1; 283 503 cycles_t time2; 504 + struct ptc_stats *stat = &per_cpu(ptcstats, bcp->cpu); 505 + struct bau_control *smaster = bcp->socket_master; 506 + struct bau_control *hmaster = bcp->uvhub_master; 507 + 508 + /* 509 + * Spin here while there are hmaster->max_concurrent or more active 510 + * descriptors. This is the per-uvhub 'throttle'. 511 + */ 512 + if (!atomic_inc_unless_ge(&hmaster->uvhub_lock, 513 + &hmaster->active_descriptor_count, 514 + hmaster->max_concurrent)) { 515 + stat->s_throttles++; 516 + do { 517 + cpu_relax(); 518 + } while (!atomic_inc_unless_ge(&hmaster->uvhub_lock, 519 + &hmaster->active_descriptor_count, 520 + hmaster->max_concurrent)); 521 + } 522 + 523 + while (hmaster->uvhub_quiesce) 524 + cpu_relax(); 284 525 285 526 if (cpu < UV_CPUS_PER_ACT_STATUS) { 286 527 mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_0; ··· 556 269 } 557 270 time1 = get_cycles(); 558 271 do { 559 - tries++; 272 + /* 273 + * Every message from any given cpu gets a unique message 274 + * sequence number. But retries use that same number. 275 + * Our message may have timed out at the destination because 276 + * all sw-ack resources are in use and there is a timeout 277 + * pending there. In that case, our last send never got 278 + * placed into the queue and we need to persist until it 279 + * does. 280 + * 281 + * Make any retry a type MSG_RETRY so that the destination will 282 + * free any resource held by a previous message from this cpu. 
283 + */ 284 + if (try == 0) { 285 + /* use message type set by the caller the first time */ 286 + seq_number = bcp->message_number++; 287 + } else { 288 + /* use RETRY type on all the rest; same sequence */ 289 + bau_desc->header.msg_type = MSG_RETRY; 290 + stat->s_retry_messages++; 291 + } 292 + bau_desc->header.sequence = seq_number; 560 293 index = (1UL << UVH_LB_BAU_SB_ACTIVATION_CONTROL_PUSH_SHFT) | 561 - cpu; 562 - uv_write_local_mmr(UVH_LB_BAU_SB_ACTIVATION_CONTROL, index); 563 - completion_status = uv_wait_completion(bau_desc, mmr_offset, 564 - right_shift); 565 - } while (completion_status == FLUSH_RETRY); 566 - time2 = get_cycles(); 567 - __get_cpu_var(ptcstats).sflush += (time2 - time1); 568 - if (tries > 1) 569 - __get_cpu_var(ptcstats).retriesok++; 294 + bcp->uvhub_cpu; 295 + bcp->send_message = get_cycles(); 570 296 571 - if (completion_status == FLUSH_GIVEUP) { 297 + uv_write_local_mmr(UVH_LB_BAU_SB_ACTIVATION_CONTROL, index); 298 + 299 + try++; 300 + completion_status = uv_wait_completion(bau_desc, mmr_offset, 301 + right_shift, this_cpu, bcp, smaster, try); 302 + 303 + if (completion_status == FLUSH_RETRY_PLUGGED) { 304 + /* 305 + * Our retries may be blocked by all destination swack 306 + * resources being consumed, and a timeout pending. In 307 + * that case hardware immediately returns the ERROR 308 + * that looks like a destination timeout. 309 + */ 310 + udelay(TIMEOUT_DELAY); 311 + bcp->plugged_tries++; 312 + if (bcp->plugged_tries >= PLUGSB4RESET) { 313 + bcp->plugged_tries = 0; 314 + quiesce_local_uvhub(hmaster); 315 + spin_lock(&hmaster->queue_lock); 316 + uv_reset_with_ipi(&bau_desc->distribution, 317 + this_cpu); 318 + spin_unlock(&hmaster->queue_lock); 319 + end_uvhub_quiesce(hmaster); 320 + bcp->ipi_attempts++; 321 + stat->s_resets_plug++; 322 + } 323 + } else if (completion_status == FLUSH_RETRY_TIMEOUT) { 324 + hmaster->max_concurrent = 1; 325 + bcp->timeout_tries++; 326 + udelay(TIMEOUT_DELAY); 327 + if (bcp->timeout_tries >= TIMEOUTSB4RESET) { 328 + bcp->timeout_tries = 0; 329 + quiesce_local_uvhub(hmaster); 330 + spin_lock(&hmaster->queue_lock); 331 + uv_reset_with_ipi(&bau_desc->distribution, 332 + this_cpu); 333 + spin_unlock(&hmaster->queue_lock); 334 + end_uvhub_quiesce(hmaster); 335 + bcp->ipi_attempts++; 336 + stat->s_resets_timeout++; 337 + } 338 + } 339 + if (bcp->ipi_attempts >= 3) { 340 + bcp->ipi_attempts = 0; 341 + completion_status = FLUSH_GIVEUP; 342 + break; 343 + } 344 + cpu_relax(); 345 + } while ((completion_status == FLUSH_RETRY_PLUGGED) || 346 + (completion_status == FLUSH_RETRY_TIMEOUT)); 347 + time2 = get_cycles(); 348 + 349 + if ((completion_status == FLUSH_COMPLETE) && (bcp->conseccompletes > 5) 350 + && (hmaster->max_concurrent < hmaster->max_concurrent_constant)) 351 + hmaster->max_concurrent++; 352 + 353 + /* 354 + * hold any cpu not timing out here; no other cpu currently held by 355 + * the 'throttle' should enter the activation code 356 + */ 357 + while (hmaster->uvhub_quiesce) 358 + cpu_relax(); 359 + atomic_dec(&hmaster->active_descriptor_count); 360 + 361 + /* guard against cycles wrap */ 362 + if (time2 > time1) 363 + stat->s_time += (time2 - time1); 364 + else 365 + stat->s_requestor--; /* don't count this one */ 366 + if (completion_status == FLUSH_COMPLETE && try > 1) 367 + stat->s_retriesok++; 368 + else if (completion_status == FLUSH_GIVEUP) { 572 369 /* 573 370 * Cause the caller to do an IPI-style TLB shootdown on 574 - * the cpu's, all of which are still in the mask. 
371 + * the target cpu's, all of which are still in the mask. 575 372 */ 576 - __get_cpu_var(ptcstats).ptc_i++; 373 + stat->s_giveup++; 577 374 return flush_mask; 578 375 } 579 376 ··· 666 295 * use the IPI method of shootdown on them. 667 296 */ 668 297 for_each_cpu(bit, flush_mask) { 669 - pnode = uv_cpu_to_pnode(bit); 670 - if (pnode == this_pnode) 298 + uvhub = uv_cpu_to_blade_id(bit); 299 + if (uvhub == this_uvhub) 671 300 continue; 672 301 cpumask_clear_cpu(bit, flush_mask); 673 302 } 674 303 if (!cpumask_empty(flush_mask)) 675 304 return flush_mask; 305 + 676 306 return NULL; 677 307 } 678 - 679 - static DEFINE_PER_CPU(cpumask_var_t, uv_flush_tlb_mask); 680 308 681 309 /** 682 310 * uv_flush_tlb_others - globally purge translation cache of a virtual ··· 693 323 * The caller has derived the cpumask from the mm_struct. This function 694 324 * is called only if there are bits set in the mask. (e.g. flush_tlb_page()) 695 325 * 696 - * The cpumask is converted into a nodemask of the nodes containing 697 - * the cpus. 326 + * The cpumask is converted into a uvhubmask of the uvhubs containing 327 + * those cpus. 698 328 * 699 329 * Note that this function should be called with preemption disabled. 700 330 * ··· 706 336 struct mm_struct *mm, 707 337 unsigned long va, unsigned int cpu) 708 338 { 709 - struct cpumask *flush_mask = __get_cpu_var(uv_flush_tlb_mask); 710 - int i; 711 - int bit; 712 - int pnode; 713 - int uv_cpu; 714 - int this_pnode; 339 + int remotes; 340 + int tcpu; 341 + int uvhub; 715 342 int locals = 0; 716 343 struct bau_desc *bau_desc; 344 + struct cpumask *flush_mask; 345 + struct ptc_stats *stat; 346 + struct bau_control *bcp; 717 347 348 + if (nobau) 349 + return cpumask; 350 + 351 + bcp = &per_cpu(bau_control, cpu); 352 + /* 353 + * Each sending cpu has a per-cpu mask which it fills from the caller's 354 + * cpu mask. Only remote cpus are converted to uvhubs and copied. 355 + */ 356 + flush_mask = (struct cpumask *)per_cpu(uv_flush_tlb_mask, cpu); 357 + /* 358 + * copy cpumask to flush_mask, removing current cpu 359 + * (current cpu should already have been flushed by the caller and 360 + * should never be returned if we return flush_mask) 361 + */ 718 362 cpumask_andnot(flush_mask, cpumask, cpumask_of(cpu)); 363 + if (cpu_isset(cpu, *cpumask)) 364 + locals++; /* current cpu was targeted */ 719 365 720 - uv_cpu = uv_blade_processor_id(); 721 - this_pnode = uv_hub_info->pnode; 722 - bau_desc = __get_cpu_var(bau_control).descriptor_base; 723 - bau_desc += UV_ITEMS_PER_DESCRIPTOR * uv_cpu; 366 + bau_desc = bcp->descriptor_base; 367 + bau_desc += UV_ITEMS_PER_DESCRIPTOR * bcp->uvhub_cpu; 724 368 725 - bau_nodes_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE); 726 - 727 - i = 0; 728 - for_each_cpu(bit, flush_mask) { 729 - pnode = uv_cpu_to_pnode(bit); 730 - BUG_ON(pnode > (UV_DISTRIBUTION_SIZE - 1)); 731 - if (pnode == this_pnode) { 369 + bau_uvhubs_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE); 370 + remotes = 0; 371 + for_each_cpu(tcpu, flush_mask) { 372 + uvhub = uv_cpu_to_blade_id(tcpu); 373 + if (uvhub == bcp->uvhub) { 732 374 locals++; 733 375 continue; 734 376 } 735 - bau_node_set(pnode - uv_partition_base_pnode, 736 - &bau_desc->distribution); 737 - i++; 377 + bau_uvhub_set(uvhub, &bau_desc->distribution); 378 + remotes++; 738 379 } 739 - if (i == 0) { 380 + if (remotes == 0) { 740 381 /* 741 - * no off_node flushing; return status for local node 382 + * No off_hub flushing; return status for local hub. 
383 + * Return the caller's mask if all were local (the current 384 + * cpu may be in that mask). 742 385 */ 743 386 if (locals) 744 - return flush_mask; 387 + return cpumask; 745 388 else 746 389 return NULL; 747 390 } 748 - __get_cpu_var(ptcstats).requestor++; 749 - __get_cpu_var(ptcstats).ntargeted += i; 391 + stat = &per_cpu(ptcstats, cpu); 392 + stat->s_requestor++; 393 + stat->s_ntargcpu += remotes; 394 + remotes = bau_uvhub_weight(&bau_desc->distribution); 395 + stat->s_ntarguvhub += remotes; 396 + if (remotes >= 16) 397 + stat->s_ntarguvhub16++; 398 + else if (remotes >= 8) 399 + stat->s_ntarguvhub8++; 400 + else if (remotes >= 4) 401 + stat->s_ntarguvhub4++; 402 + else if (remotes >= 2) 403 + stat->s_ntarguvhub2++; 404 + else 405 + stat->s_ntarguvhub1++; 750 406 751 407 bau_desc->payload.address = va; 752 408 bau_desc->payload.sending_cpu = cpu; 753 409 754 - return uv_flush_send_and_wait(uv_cpu, this_pnode, bau_desc, flush_mask); 410 + /* 411 + * uv_flush_send_and_wait returns null if all cpu's were messaged, or 412 + * the adjusted flush_mask if any cpu's were not messaged. 413 + */ 414 + return uv_flush_send_and_wait(bau_desc, flush_mask, bcp); 755 415 } 756 416 757 417 /* ··· 790 390 * 791 391 * We received a broadcast assist message. 792 392 * 793 - * Interrupts may have been disabled; this interrupt could represent 393 + * Interrupts are disabled; this interrupt could represent 794 394 * the receipt of several messages. 795 395 * 796 - * All cores/threads on this node get this interrupt. 797 - * The last one to see it does the s/w ack. 396 + * All cores/threads on this hub get this interrupt. 397 + * The last one to see it does the software ack. 798 398 * (the resource will not be freed until noninterruptable cpus see this 799 - * interrupt; hardware will timeout the s/w ack and reply ERROR) 399 + * interrupt; hardware may timeout the s/w ack and reply ERROR) 800 400 */ 801 401 void uv_bau_message_interrupt(struct pt_regs *regs) 802 402 { 803 - struct bau_payload_queue_entry *va_queue_first; 804 - struct bau_payload_queue_entry *va_queue_last; 805 - struct bau_payload_queue_entry *msg; 806 - struct pt_regs *old_regs = set_irq_regs(regs); 807 - cycles_t time1; 808 - cycles_t time2; 809 - int msg_slot; 810 - int sw_ack_slot; 811 - int fw; 812 403 int count = 0; 813 - unsigned long local_pnode; 404 + cycles_t time_start; 405 + struct bau_payload_queue_entry *msg; 406 + struct bau_control *bcp; 407 + struct ptc_stats *stat; 408 + struct msg_desc msgdesc; 814 409 815 - ack_APIC_irq(); 816 - exit_idle(); 817 - irq_enter(); 818 - 819 - time1 = get_cycles(); 820 - 821 - local_pnode = uv_blade_to_pnode(uv_numa_blade_id()); 822 - 823 - va_queue_first = __get_cpu_var(bau_control).va_queue_first; 824 - va_queue_last = __get_cpu_var(bau_control).va_queue_last; 825 - 826 - msg = __get_cpu_var(bau_control).bau_msg_head; 410 + time_start = get_cycles(); 411 + bcp = &per_cpu(bau_control, smp_processor_id()); 412 + stat = &per_cpu(ptcstats, smp_processor_id()); 413 + msgdesc.va_queue_first = bcp->va_queue_first; 414 + msgdesc.va_queue_last = bcp->va_queue_last; 415 + msg = bcp->bau_msg_head; 827 416 while (msg->sw_ack_vector) { 828 417 count++; 829 - fw = msg->sw_ack_vector; 830 - msg_slot = msg - va_queue_first; 831 - sw_ack_slot = ffs(fw) - 1; 832 - 833 - uv_bau_process_message(msg, msg_slot, sw_ack_slot); 834 - 418 + msgdesc.msg_slot = msg - msgdesc.va_queue_first; 419 + msgdesc.sw_ack_slot = ffs(msg->sw_ack_vector) - 1; 420 + msgdesc.msg = msg; 421 + uv_bau_process_message(&msgdesc, bcp); 
835 422 msg++; 836 - if (msg > va_queue_last) 837 - msg = va_queue_first; 838 - __get_cpu_var(bau_control).bau_msg_head = msg; 423 + if (msg > msgdesc.va_queue_last) 424 + msg = msgdesc.va_queue_first; 425 + bcp->bau_msg_head = msg; 839 426 } 427 + stat->d_time += (get_cycles() - time_start); 840 428 if (!count) 841 - __get_cpu_var(ptcstats).nomsg++; 429 + stat->d_nomsg++; 842 430 else if (count > 1) 843 - __get_cpu_var(ptcstats).multmsg++; 844 - 845 - time2 = get_cycles(); 846 - __get_cpu_var(ptcstats).dflush += (time2 - time1); 847 - 848 - irq_exit(); 849 - set_irq_regs(old_regs); 431 + stat->d_multmsg++; 432 + ack_APIC_irq(); 850 433 } 851 434 852 435 /* 853 436 * uv_enable_timeouts 854 437 * 855 - * Each target blade (i.e. blades that have cpu's) needs to have 438 + * Each target uvhub (i.e. a uvhub that has no cpu's) needs to have 856 439 * shootdown message timeouts enabled. The timeout does not cause 857 440 * an interrupt, but causes an error message to be returned to 858 441 * the sender. 859 442 */ 860 443 static void uv_enable_timeouts(void) 861 444 { 862 - int blade; 863 - int nblades; 445 + int uvhub; 446 + int nuvhubs; 864 447 int pnode; 865 448 unsigned long mmr_image; 866 449 867 - nblades = uv_num_possible_blades(); 450 + nuvhubs = uv_num_possible_blades(); 868 451 869 - for (blade = 0; blade < nblades; blade++) { 870 - if (!uv_blade_nr_possible_cpus(blade)) 452 + for (uvhub = 0; uvhub < nuvhubs; uvhub++) { 453 + if (!uv_blade_nr_possible_cpus(uvhub)) 871 454 continue; 872 455 873 - pnode = uv_blade_to_pnode(blade); 456 + pnode = uv_blade_to_pnode(uvhub); 874 457 mmr_image = 875 458 uv_read_global_mmr64(pnode, UVH_LB_BAU_MISC_CONTROL); 876 459 /* ··· 906 523 { 907 524 } 908 525 526 + static inline unsigned long long 527 + millisec_2_cycles(unsigned long millisec) 528 + { 529 + unsigned long ns; 530 + unsigned long long cyc; 531 + 532 + ns = millisec * 1000; 533 + cyc = (ns << CYC2NS_SCALE_FACTOR)/(per_cpu(cyc2ns, smp_processor_id())); 534 + return cyc; 535 + } 536 + 909 537 /* 910 - * Display the statistics thru /proc 911 - * data points to the cpu number 538 + * Display the statistics thru /proc. 
539 + * 'data' points to the cpu number 912 540 */ 913 541 static int uv_ptc_seq_show(struct seq_file *file, void *data) 914 542 { ··· 930 536 931 537 if (!cpu) { 932 538 seq_printf(file, 933 - "# cpu requestor requestee one all sretry dretry ptc_i "); 539 + "# cpu sent stime numuvhubs numuvhubs16 numuvhubs8 "); 934 540 seq_printf(file, 935 - "sw_ack sflush dflush sok dnomsg dmult starget\n"); 541 + "numuvhubs4 numuvhubs2 numuvhubs1 numcpus dto "); 542 + seq_printf(file, 543 + "retries rok resetp resett giveup sto bz throt "); 544 + seq_printf(file, 545 + "sw_ack recv rtime all "); 546 + seq_printf(file, 547 + "one mult none retry canc nocan reset rcan\n"); 936 548 } 937 549 if (cpu < num_possible_cpus() && cpu_online(cpu)) { 938 550 stat = &per_cpu(ptcstats, cpu); 939 - seq_printf(file, "cpu %d %ld %ld %ld %ld %ld %ld %ld ", 940 - cpu, stat->requestor, 941 - stat->requestee, stat->onetlb, stat->alltlb, 942 - stat->s_retry, stat->d_retry, stat->ptc_i); 943 - seq_printf(file, "%lx %ld %ld %ld %ld %ld %ld\n", 551 + /* source side statistics */ 552 + seq_printf(file, 553 + "cpu %d %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ", 554 + cpu, stat->s_requestor, cycles_2_us(stat->s_time), 555 + stat->s_ntarguvhub, stat->s_ntarguvhub16, 556 + stat->s_ntarguvhub8, stat->s_ntarguvhub4, 557 + stat->s_ntarguvhub2, stat->s_ntarguvhub1, 558 + stat->s_ntargcpu, stat->s_dtimeout); 559 + seq_printf(file, "%ld %ld %ld %ld %ld %ld %ld %ld ", 560 + stat->s_retry_messages, stat->s_retriesok, 561 + stat->s_resets_plug, stat->s_resets_timeout, 562 + stat->s_giveup, stat->s_stimeout, 563 + stat->s_busy, stat->s_throttles); 564 + /* destination side statistics */ 565 + seq_printf(file, 566 + "%lx %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld\n", 944 567 uv_read_global_mmr64(uv_cpu_to_pnode(cpu), 945 568 UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE), 946 - stat->sflush, stat->dflush, 947 - stat->retriesok, stat->nomsg, 948 - stat->multmsg, stat->ntargeted); 569 + stat->d_requestee, cycles_2_us(stat->d_time), 570 + stat->d_alltlb, stat->d_onetlb, stat->d_multmsg, 571 + stat->d_nomsg, stat->d_retries, stat->d_canceled, 572 + stat->d_nocanceled, stat->d_resets, 573 + stat->d_rcanceled); 949 574 } 950 575 951 576 return 0; 952 577 } 953 578 954 579 /* 580 + * -1: resetf the statistics 955 581 * 0: display meaning of the statistics 956 - * >0: retry limit 582 + * >0: maximum concurrent active descriptors per uvhub (throttle) 957 583 */ 958 584 static ssize_t uv_ptc_proc_write(struct file *file, const char __user *user, 959 585 size_t count, loff_t *data) 960 586 { 961 - long newmode; 587 + int cpu; 588 + long input_arg; 962 589 char optstr[64]; 590 + struct ptc_stats *stat; 591 + struct bau_control *bcp; 963 592 964 593 if (count == 0 || count > sizeof(optstr)) 965 594 return -EINVAL; 966 595 if (copy_from_user(optstr, user, count)) 967 596 return -EFAULT; 968 597 optstr[count - 1] = '\0'; 969 - if (strict_strtoul(optstr, 10, &newmode) < 0) { 598 + if (strict_strtol(optstr, 10, &input_arg) < 0) { 970 599 printk(KERN_DEBUG "%s is invalid\n", optstr); 971 600 return -EINVAL; 972 601 } 973 602 974 - if (newmode == 0) { 603 + if (input_arg == 0) { 975 604 printk(KERN_DEBUG "# cpu: cpu number\n"); 605 + printk(KERN_DEBUG "Sender statistics:\n"); 976 606 printk(KERN_DEBUG 977 - "requestor: times this cpu was the flush requestor\n"); 607 + "sent: number of shootdown messages sent\n"); 978 608 printk(KERN_DEBUG 979 - "requestee: times this cpu was requested to flush its TLBs\n"); 609 + "stime: time spent sending messages\n"); 980 610 
printk(KERN_DEBUG 981 - "one: times requested to flush a single address\n"); 611 + "numuvhubs: number of hubs targeted with shootdown\n"); 982 612 printk(KERN_DEBUG 983 - "all: times requested to flush all TLB's\n"); 613 + "numuvhubs16: number times 16 or more hubs targeted\n"); 984 614 printk(KERN_DEBUG 985 - "sretry: number of retries of source-side timeouts\n"); 615 + "numuvhubs8: number times 8 or more hubs targeted\n"); 986 616 printk(KERN_DEBUG 987 - "dretry: number of retries of destination-side timeouts\n"); 617 + "numuvhubs4: number times 4 or more hubs targeted\n"); 988 618 printk(KERN_DEBUG 989 - "ptc_i: times UV fell through to IPI-style flushes\n"); 619 + "numuvhubs2: number times 2 or more hubs targeted\n"); 990 620 printk(KERN_DEBUG 991 - "sw_ack: image of UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE\n"); 621 + "numuvhubs1: number times 1 hub targeted\n"); 992 622 printk(KERN_DEBUG 993 - "sflush_us: cycles spent in uv_flush_tlb_others()\n"); 623 + "numcpus: number of cpus targeted with shootdown\n"); 994 624 printk(KERN_DEBUG 995 - "dflush_us: cycles spent in handling flush requests\n"); 996 - printk(KERN_DEBUG "sok: successes on retry\n"); 997 - printk(KERN_DEBUG "dnomsg: interrupts with no message\n"); 625 + "dto: number of destination timeouts\n"); 998 626 printk(KERN_DEBUG 999 - "dmult: interrupts with multiple messages\n"); 1000 - printk(KERN_DEBUG "starget: nodes targeted\n"); 627 + "retries: destination timeout retries sent\n"); 628 + printk(KERN_DEBUG 629 + "rok: : destination timeouts successfully retried\n"); 630 + printk(KERN_DEBUG 631 + "resetp: ipi-style resource resets for plugs\n"); 632 + printk(KERN_DEBUG 633 + "resett: ipi-style resource resets for timeouts\n"); 634 + printk(KERN_DEBUG 635 + "giveup: fall-backs to ipi-style shootdowns\n"); 636 + printk(KERN_DEBUG 637 + "sto: number of source timeouts\n"); 638 + printk(KERN_DEBUG 639 + "bz: number of stay-busy's\n"); 640 + printk(KERN_DEBUG 641 + "throt: number times spun in throttle\n"); 642 + printk(KERN_DEBUG "Destination side statistics:\n"); 643 + printk(KERN_DEBUG 644 + "sw_ack: image of UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE\n"); 645 + printk(KERN_DEBUG 646 + "recv: shootdown messages received\n"); 647 + printk(KERN_DEBUG 648 + "rtime: time spent processing messages\n"); 649 + printk(KERN_DEBUG 650 + "all: shootdown all-tlb messages\n"); 651 + printk(KERN_DEBUG 652 + "one: shootdown one-tlb messages\n"); 653 + printk(KERN_DEBUG 654 + "mult: interrupts that found multiple messages\n"); 655 + printk(KERN_DEBUG 656 + "none: interrupts that found no messages\n"); 657 + printk(KERN_DEBUG 658 + "retry: number of retry messages processed\n"); 659 + printk(KERN_DEBUG 660 + "canc: number messages canceled by retries\n"); 661 + printk(KERN_DEBUG 662 + "nocan: number retries that found nothing to cancel\n"); 663 + printk(KERN_DEBUG 664 + "reset: number of ipi-style reset requests processed\n"); 665 + printk(KERN_DEBUG 666 + "rcan: number messages canceled by reset requests\n"); 667 + } else if (input_arg == -1) { 668 + for_each_present_cpu(cpu) { 669 + stat = &per_cpu(ptcstats, cpu); 670 + memset(stat, 0, sizeof(struct ptc_stats)); 671 + } 1001 672 } else { 1002 - uv_bau_retry_limit = newmode; 1003 - printk(KERN_DEBUG "timeout retry limit:%d\n", 1004 - uv_bau_retry_limit); 673 + uv_bau_max_concurrent = input_arg; 674 + bcp = &per_cpu(bau_control, smp_processor_id()); 675 + if (uv_bau_max_concurrent < 1 || 676 + uv_bau_max_concurrent > bcp->cpus_in_uvhub) { 677 + printk(KERN_DEBUG 678 + "Error: BAU max concurrent %d; %d is 
invalid\n", 679 + bcp->max_concurrent, uv_bau_max_concurrent); 680 + return -EINVAL; 681 + } 682 + printk(KERN_DEBUG "Set BAU max concurrent:%d\n", 683 + uv_bau_max_concurrent); 684 + for_each_present_cpu(cpu) { 685 + bcp = &per_cpu(bau_control, cpu); 686 + bcp->max_concurrent = uv_bau_max_concurrent; 687 + } 1005 688 } 1006 689 1007 690 return count; ··· 1122 651 } 1123 652 1124 653 /* 1125 - * begin the initialization of the per-blade control structures 1126 - */ 1127 - static struct bau_control * __init uv_table_bases_init(int blade, int node) 1128 - { 1129 - int i; 1130 - struct bau_msg_status *msp; 1131 - struct bau_control *bau_tabp; 1132 - 1133 - bau_tabp = 1134 - kmalloc_node(sizeof(struct bau_control), GFP_KERNEL, node); 1135 - BUG_ON(!bau_tabp); 1136 - 1137 - bau_tabp->msg_statuses = 1138 - kmalloc_node(sizeof(struct bau_msg_status) * 1139 - DEST_Q_SIZE, GFP_KERNEL, node); 1140 - BUG_ON(!bau_tabp->msg_statuses); 1141 - 1142 - for (i = 0, msp = bau_tabp->msg_statuses; i < DEST_Q_SIZE; i++, msp++) 1143 - bau_cpubits_clear(&msp->seen_by, (int) 1144 - uv_blade_nr_possible_cpus(blade)); 1145 - 1146 - uv_bau_table_bases[blade] = bau_tabp; 1147 - 1148 - return bau_tabp; 1149 - } 1150 - 1151 - /* 1152 - * finish the initialization of the per-blade control structures 1153 - */ 1154 - static void __init 1155 - uv_table_bases_finish(int blade, 1156 - struct bau_control *bau_tablesp, 1157 - struct bau_desc *adp) 1158 - { 1159 - struct bau_control *bcp; 1160 - int cpu; 1161 - 1162 - for_each_present_cpu(cpu) { 1163 - if (blade != uv_cpu_to_blade_id(cpu)) 1164 - continue; 1165 - 1166 - bcp = (struct bau_control *)&per_cpu(bau_control, cpu); 1167 - bcp->bau_msg_head = bau_tablesp->va_queue_first; 1168 - bcp->va_queue_first = bau_tablesp->va_queue_first; 1169 - bcp->va_queue_last = bau_tablesp->va_queue_last; 1170 - bcp->msg_statuses = bau_tablesp->msg_statuses; 1171 - bcp->descriptor_base = adp; 1172 - } 1173 - } 1174 - 1175 - /* 1176 654 * initialize the sending side's sending buffers 1177 655 */ 1178 - static struct bau_desc * __init 656 + static void 1179 657 uv_activation_descriptor_init(int node, int pnode) 1180 658 { 1181 659 int i; 660 + int cpu; 1182 661 unsigned long pa; 1183 662 unsigned long m; 1184 663 unsigned long n; 1185 - struct bau_desc *adp; 1186 - struct bau_desc *ad2; 664 + struct bau_desc *bau_desc; 665 + struct bau_desc *bd2; 666 + struct bau_control *bcp; 1187 667 1188 668 /* 1189 669 * each bau_desc is 64 bytes; there are 8 (UV_ITEMS_PER_DESCRIPTOR) 1190 - * per cpu; and up to 32 (UV_ADP_SIZE) cpu's per blade 670 + * per cpu; and up to 32 (UV_ADP_SIZE) cpu's per uvhub 1191 671 */ 1192 - adp = (struct bau_desc *)kmalloc_node(sizeof(struct bau_desc)* 672 + bau_desc = (struct bau_desc *)kmalloc_node(sizeof(struct bau_desc)* 1193 673 UV_ADP_SIZE*UV_ITEMS_PER_DESCRIPTOR, GFP_KERNEL, node); 1194 - BUG_ON(!adp); 674 + BUG_ON(!bau_desc); 1195 675 1196 - pa = uv_gpa(adp); /* need the real nasid*/ 1197 - n = uv_gpa_to_pnode(pa); 676 + pa = uv_gpa(bau_desc); /* need the real nasid*/ 677 + n = pa >> uv_nshift; 1198 678 m = pa & uv_mmask; 1199 679 1200 680 uv_write_global_mmr64(pnode, UVH_LB_BAU_SB_DESCRIPTOR_BASE, ··· 1154 732 /* 1155 733 * initializing all 8 (UV_ITEMS_PER_DESCRIPTOR) descriptors for each 1156 734 * cpu even though we only use the first one; one descriptor can 1157 - * describe a broadcast to 256 nodes. 735 + * describe a broadcast to 256 uv hubs. 
1158 736 */ 1159 - for (i = 0, ad2 = adp; i < (UV_ADP_SIZE*UV_ITEMS_PER_DESCRIPTOR); 1160 - i++, ad2++) { 1161 - memset(ad2, 0, sizeof(struct bau_desc)); 1162 - ad2->header.sw_ack_flag = 1; 737 + for (i = 0, bd2 = bau_desc; i < (UV_ADP_SIZE*UV_ITEMS_PER_DESCRIPTOR); 738 + i++, bd2++) { 739 + memset(bd2, 0, sizeof(struct bau_desc)); 740 + bd2->header.sw_ack_flag = 1; 1163 741 /* 1164 - * base_dest_nodeid is the first node in the partition, so 1165 - * the bit map will indicate partition-relative node numbers. 1166 - * note that base_dest_nodeid is actually a nasid. 742 + * base_dest_nodeid is the nasid (pnode<<1) of the first uvhub 743 + * in the partition. The bit map will indicate uvhub numbers, 744 + * which are 0-N in a partition. Pnodes are unique system-wide. 1167 745 */ 1168 - ad2->header.base_dest_nodeid = uv_partition_base_pnode << 1; 1169 - ad2->header.dest_subnodeid = 0x10; /* the LB */ 1170 - ad2->header.command = UV_NET_ENDPOINT_INTD; 1171 - ad2->header.int_both = 1; 746 + bd2->header.base_dest_nodeid = uv_partition_base_pnode << 1; 747 + bd2->header.dest_subnodeid = 0x10; /* the LB */ 748 + bd2->header.command = UV_NET_ENDPOINT_INTD; 749 + bd2->header.int_both = 1; 1172 750 /* 1173 751 * all others need to be set to zero: 1174 752 * fairness chaining multilevel count replied_to 1175 753 */ 1176 754 } 1177 - return adp; 755 + for_each_present_cpu(cpu) { 756 + if (pnode != uv_blade_to_pnode(uv_cpu_to_blade_id(cpu))) 757 + continue; 758 + bcp = &per_cpu(bau_control, cpu); 759 + bcp->descriptor_base = bau_desc; 760 + } 1178 761 } 1179 762 1180 763 /* 1181 764 * initialize the destination side's receiving buffers 765 + * entered for each uvhub in the partition 766 + * - node is first node (kernel memory notion) on the uvhub 767 + * - pnode is the uvhub's physical identifier 1182 768 */ 1183 - static struct bau_payload_queue_entry * __init 1184 - uv_payload_queue_init(int node, int pnode, struct bau_control *bau_tablesp) 769 + static void 770 + uv_payload_queue_init(int node, int pnode) 1185 771 { 1186 - struct bau_payload_queue_entry *pqp; 1187 - unsigned long pa; 1188 772 int pn; 773 + int cpu; 1189 774 char *cp; 775 + unsigned long pa; 776 + struct bau_payload_queue_entry *pqp; 777 + struct bau_payload_queue_entry *pqp_malloc; 778 + struct bau_control *bcp; 1190 779 1191 780 pqp = (struct bau_payload_queue_entry *) kmalloc_node( 1192 781 (DEST_Q_SIZE + 1) * sizeof(struct bau_payload_queue_entry), 1193 782 GFP_KERNEL, node); 1194 783 BUG_ON(!pqp); 784 + pqp_malloc = pqp; 1195 785 1196 786 cp = (char *)pqp + 31; 1197 787 pqp = (struct bau_payload_queue_entry *)(((unsigned long)cp >> 5) << 5); 1198 - bau_tablesp->va_queue_first = pqp; 788 + 789 + for_each_present_cpu(cpu) { 790 + if (pnode != uv_cpu_to_pnode(cpu)) 791 + continue; 792 + /* for every cpu on this pnode: */ 793 + bcp = &per_cpu(bau_control, cpu); 794 + bcp->va_queue_first = pqp; 795 + bcp->bau_msg_head = pqp; 796 + bcp->va_queue_last = pqp + (DEST_Q_SIZE - 1); 797 + } 1199 798 /* 1200 799 * need the pnode of where the memory was really allocated 1201 800 */ 1202 801 pa = uv_gpa(pqp); 1203 - pn = uv_gpa_to_pnode(pa); 802 + pn = pa >> uv_nshift; 1204 803 uv_write_global_mmr64(pnode, 1205 804 UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST, 1206 805 ((unsigned long)pn << UV_PAYLOADQ_PNODE_SHIFT) | 1207 806 uv_physnodeaddr(pqp)); 1208 807 uv_write_global_mmr64(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL, 1209 808 uv_physnodeaddr(pqp)); 1210 - bau_tablesp->va_queue_last = pqp + (DEST_Q_SIZE - 1); 1211 809 uv_write_global_mmr64(pnode, 
UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST, 1212 810 (unsigned long) 1213 - uv_physnodeaddr(bau_tablesp->va_queue_last)); 811 + uv_physnodeaddr(pqp + (DEST_Q_SIZE - 1))); 812 + /* in effect, all msg_type's are set to MSG_NOOP */ 1214 813 memset(pqp, 0, sizeof(struct bau_payload_queue_entry) * DEST_Q_SIZE); 1215 - 1216 - return pqp; 1217 814 } 1218 815 1219 816 /* 1220 - * Initialization of each UV blade's structures 817 + * Initialization of each UV hub's structures 1221 818 */ 1222 - static int __init uv_init_blade(int blade) 819 + static void __init uv_init_uvhub(int uvhub, int vector) 1223 820 { 1224 821 int node; 1225 822 int pnode; 1226 - unsigned long pa; 1227 823 unsigned long apicid; 1228 - struct bau_desc *adp; 1229 - struct bau_payload_queue_entry *pqp; 1230 - struct bau_control *bau_tablesp; 1231 824 1232 - node = blade_to_first_node(blade); 1233 - bau_tablesp = uv_table_bases_init(blade, node); 1234 - pnode = uv_blade_to_pnode(blade); 1235 - adp = uv_activation_descriptor_init(node, pnode); 1236 - pqp = uv_payload_queue_init(node, pnode, bau_tablesp); 1237 - uv_table_bases_finish(blade, bau_tablesp, adp); 825 + node = uvhub_to_first_node(uvhub); 826 + pnode = uv_blade_to_pnode(uvhub); 827 + uv_activation_descriptor_init(node, pnode); 828 + uv_payload_queue_init(node, pnode); 1238 829 /* 1239 830 * the below initialization can't be in firmware because the 1240 831 * messaging IRQ will be determined by the OS 1241 832 */ 1242 - apicid = blade_to_first_apicid(blade); 1243 - pa = uv_read_global_mmr64(pnode, UVH_BAU_DATA_CONFIG); 833 + apicid = uvhub_to_first_apicid(uvhub); 1244 834 uv_write_global_mmr64(pnode, UVH_BAU_DATA_CONFIG, 1245 - ((apicid << 32) | UV_BAU_MESSAGE)); 1246 - return 0; 835 + ((apicid << 32) | vector)); 836 + } 837 + 838 + /* 839 + * initialize the bau_control structure for each cpu 840 + */ 841 + static void uv_init_per_cpu(int nuvhubs) 842 + { 843 + int i, j, k; 844 + int cpu; 845 + int pnode; 846 + int uvhub; 847 + short socket = 0; 848 + struct bau_control *bcp; 849 + struct uvhub_desc *bdp; 850 + struct socket_desc *sdp; 851 + struct bau_control *hmaster = NULL; 852 + struct bau_control *smaster = NULL; 853 + struct socket_desc { 854 + short num_cpus; 855 + short cpu_number[16]; 856 + }; 857 + struct uvhub_desc { 858 + short num_sockets; 859 + short num_cpus; 860 + short uvhub; 861 + short pnode; 862 + struct socket_desc socket[2]; 863 + }; 864 + struct uvhub_desc *uvhub_descs; 865 + 866 + uvhub_descs = (struct uvhub_desc *) 867 + kmalloc(nuvhubs * sizeof(struct uvhub_desc), GFP_KERNEL); 868 + memset(uvhub_descs, 0, nuvhubs * sizeof(struct uvhub_desc)); 869 + for_each_present_cpu(cpu) { 870 + bcp = &per_cpu(bau_control, cpu); 871 + memset(bcp, 0, sizeof(struct bau_control)); 872 + spin_lock_init(&bcp->masks_lock); 873 + bcp->max_concurrent = uv_bau_max_concurrent; 874 + pnode = uv_cpu_hub_info(cpu)->pnode; 875 + uvhub = uv_cpu_hub_info(cpu)->numa_blade_id; 876 + bdp = &uvhub_descs[uvhub]; 877 + bdp->num_cpus++; 878 + bdp->uvhub = uvhub; 879 + bdp->pnode = pnode; 880 + /* time interval to catch a hardware stay-busy bug */ 881 + bcp->timeout_interval = millisec_2_cycles(3); 882 + /* kludge: assume uv_hub.h is constant */ 883 + socket = (cpu_physical_id(cpu)>>5)&1; 884 + if (socket >= bdp->num_sockets) 885 + bdp->num_sockets = socket+1; 886 + sdp = &bdp->socket[socket]; 887 + sdp->cpu_number[sdp->num_cpus] = cpu; 888 + sdp->num_cpus++; 889 + } 890 + socket = 0; 891 + for_each_possible_blade(uvhub) { 892 + bdp = &uvhub_descs[uvhub]; 893 + for (i = 0; i < 
bdp->num_sockets; i++) { 894 + sdp = &bdp->socket[i]; 895 + for (j = 0; j < sdp->num_cpus; j++) { 896 + cpu = sdp->cpu_number[j]; 897 + bcp = &per_cpu(bau_control, cpu); 898 + bcp->cpu = cpu; 899 + if (j == 0) { 900 + smaster = bcp; 901 + if (i == 0) 902 + hmaster = bcp; 903 + } 904 + bcp->cpus_in_uvhub = bdp->num_cpus; 905 + bcp->cpus_in_socket = sdp->num_cpus; 906 + bcp->socket_master = smaster; 907 + bcp->uvhub_master = hmaster; 908 + for (k = 0; k < DEST_Q_SIZE; k++) 909 + bcp->socket_acknowledge_count[k] = 0; 910 + bcp->uvhub_cpu = 911 + uv_cpu_hub_info(cpu)->blade_processor_id; 912 + } 913 + socket++; 914 + } 915 + } 916 + kfree(uvhub_descs); 1247 917 } 1248 918 1249 919 /* ··· 1343 829 */ 1344 830 static int __init uv_bau_init(void) 1345 831 { 1346 - int blade; 1347 - int nblades; 832 + int uvhub; 833 + int pnode; 834 + int nuvhubs; 1348 835 int cur_cpu; 836 + int vector; 837 + unsigned long mmr; 1349 838 1350 839 if (!is_uv_system()) 840 + return 0; 841 + 842 + if (nobau) 1351 843 return 0; 1352 844 1353 845 for_each_possible_cpu(cur_cpu) 1354 846 zalloc_cpumask_var_node(&per_cpu(uv_flush_tlb_mask, cur_cpu), 1355 847 GFP_KERNEL, cpu_to_node(cur_cpu)); 1356 848 1357 - uv_bau_retry_limit = 1; 849 + uv_bau_max_concurrent = MAX_BAU_CONCURRENT; 850 + uv_nshift = uv_hub_info->m_val; 1358 851 uv_mmask = (1UL << uv_hub_info->m_val) - 1; 1359 - nblades = uv_num_possible_blades(); 852 + nuvhubs = uv_num_possible_blades(); 1360 853 1361 - uv_bau_table_bases = (struct bau_control **) 1362 - kmalloc(nblades * sizeof(struct bau_control *), GFP_KERNEL); 1363 - BUG_ON(!uv_bau_table_bases); 854 + uv_init_per_cpu(nuvhubs); 1364 855 1365 856 uv_partition_base_pnode = 0x7fffffff; 1366 - for (blade = 0; blade < nblades; blade++) 1367 - if (uv_blade_nr_possible_cpus(blade) && 1368 - (uv_blade_to_pnode(blade) < uv_partition_base_pnode)) 1369 - uv_partition_base_pnode = uv_blade_to_pnode(blade); 1370 - for (blade = 0; blade < nblades; blade++) 1371 - if (uv_blade_nr_possible_cpus(blade)) 1372 - uv_init_blade(blade); 857 + for (uvhub = 0; uvhub < nuvhubs; uvhub++) 858 + if (uv_blade_nr_possible_cpus(uvhub) && 859 + (uv_blade_to_pnode(uvhub) < uv_partition_base_pnode)) 860 + uv_partition_base_pnode = uv_blade_to_pnode(uvhub); 1373 861 1374 - alloc_intr_gate(UV_BAU_MESSAGE, uv_bau_message_intr1); 862 + vector = UV_BAU_MESSAGE; 863 + for_each_possible_blade(uvhub) 864 + if (uv_blade_nr_possible_cpus(uvhub)) 865 + uv_init_uvhub(uvhub, vector); 866 + 1375 867 uv_enable_timeouts(); 868 + alloc_intr_gate(vector, uv_bau_message_intr1); 869 + 870 + for_each_possible_blade(uvhub) { 871 + pnode = uv_blade_to_pnode(uvhub); 872 + /* INIT the bau */ 873 + uv_write_global_mmr64(pnode, UVH_LB_BAU_SB_ACTIVATION_CONTROL, 874 + ((unsigned long)1 << 63)); 875 + mmr = 1; /* should be 1 to broadcast to both sockets */ 876 + uv_write_global_mmr64(pnode, UVH_BAU_DATA_BROADCAST, mmr); 877 + } 1376 878 1377 879 return 0; 1378 880 } 1379 - __initcall(uv_bau_init); 1380 - __initcall(uv_ptc_init); 881 + core_initcall(uv_bau_init); 882 + core_initcall(uv_ptc_init);
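A minimal user-space sketch of driving the controls added above, illustrative only and not part of the patch. It assumes the statistics file is exposed at /proc/sgi_uv/ptc_statistics and follows the write handler shown earlier: writing "-1" zeroes every cpu's ptc_stats, and writing a positive value becomes each cpu's max_concurrent throttle (values outside 1..cpus_in_uvhub are rejected with -EINVAL).

/* illustrative sketch only; path and semantics per the write handler above */
#include <stdio.h>

static int ptc_write(const char *val)
{
        FILE *f = fopen("/proc/sgi_uv/ptc_statistics", "w");

        if (!f) {
                perror("ptc_statistics");
                return -1;
        }
        fprintf(f, "%s\n", val);
        return fclose(f);
}

int main(void)
{
        /* booting with "nobau" disables the BAU entirely (see uv_bau_init) */
        if (ptc_write("-1"))            /* clear the per-cpu statistics */
                return 1;
        return ptc_write("3") ? 1 : 0;  /* at most 3 concurrent broadcasts per uvhub */
}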