x86, UV: BAU tunables into a debugfs file

Make the Broadcast Assist Unit driver's nine tuning values adjustable at
run time by exposing them through a single read/write debugfs file.

With debugfs mounted in the usual place, the file appears as
/sys/kernel/debug/sgi_uv/bau_tunables. The tunables are kept in each
cpu's per-cpu BAU structure (struct bau_control); a write to the file
updates every present cpu's copy.
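
A minimal userspace sketch of how the file might be driven (illustration
only, not part of the patch; it assumes debugfs is mounted at
/sys/kernel/debug and that the caller may open the 0600 file):

    #include <stdio.h>

    #define TUNABLES "/sys/kernel/debug/sgi_uv/bau_tunables"

    int main(void)
    {
            char line[300];
            FILE *f = fopen(TUNABLES, "r");

            if (!f) {
                    perror(TUNABLES);
                    return 1;
            }
            /* line 1: the nine field names; line 2: current values */
            while (fgets(line, sizeof(line), f))
                    fputs(line, stdout);
            fclose(f);

            /* writing nine zeros resets every tunable to its default */
            f = fopen(TUNABLES, "w");
            if (!f) {
                    perror(TUNABLES);
                    return 1;
            }
            fputs("0 0 0 0 0 0 0 0 0\n", f);
            return fclose(f) ? 1 : 0;
    }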

The patch also does a little name improvement (max_concurrent becomes
max_bau_concurrent), and corrects the reset of two destination timeout
counters (plugged_tries and timeout_tries are now cleared after every
broadcast).

Signed-off-by: Cliff Wickman <cpw@sgi.com>
Cc: gregkh@suse.de
LKML-Reference: <E1OJvNx-0004Zx-Uo@eag09.americas.sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

Authored by Cliff Wickman and committed by Ingo Molnar (commit e8e5e8a8, parent 12a6611f).

 arch/x86/include/asm/uv/uv_bau.h |  +38 -17
 arch/x86/kernel/tlb_uv.c         | +243 -42
 2 files changed: +281 -59

--- a/arch/x86/include/asm/uv/uv_bau.h
+++ b/arch/x86/include/asm/uv/uv_bau.h
@@ ... @@
 #define UV_DESC_BASE_PNODE_SHIFT 49
 #define UV_PAYLOADQ_PNODE_SHIFT 49
 #define UV_PTC_BASENAME "sgi_uv/ptc_statistics"
+#define UV_BAU_BASENAME "sgi_uv/bau_tunables"
+#define UV_BAU_TUNABLES_DIR "sgi_uv"
+#define UV_BAU_TUNABLES_FILE "bau_tunables"
+#define WHITESPACE " \t\n"
 #define uv_physnodeaddr(x) ((__pa((unsigned long)(x)) & uv_mmask))
 #define UV_ENABLE_INTD_SOFT_ACK_MODE_SHIFT 15
 #define UV_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHIFT 16
-#define UV_INTD_SOFT_ACK_TIMEOUT_PERIOD 0x000000000bUL
+#define UV_INTD_SOFT_ACK_TIMEOUT_PERIOD 0x0000000009UL
 /* [19:16] SOFT_ACK timeout period 19: 1 is urgency 7 17:16 1 is multiplier */
 #define BAU_MISC_CONTROL_MULT_MASK 3
 
@@ ... @@
 #define DESC_STATUS_DESTINATION_TIMEOUT 2
 #define DESC_STATUS_SOURCE_TIMEOUT 3
 
-/*
- * source side threshholds at which message retries print a warning
- */
-#define SOURCE_TIMEOUT_LIMIT 20
-#define DESTINATION_TIMEOUT_LIMIT 20
-
-/*
- * misc. delays, in microseconds
- */
-#define THROTTLE_DELAY 10
 #define TIMEOUT_DELAY 10
-#define BIOS_TO 1000
-/* BIOS is assumed to set the destination timeout to 1003520 nanoseconds */
+/*
+ * delay for 'plugged' timeout retries, in microseconds
+ */
+#define PLUGGED_DELAY 10
 
 /*
  * threshholds at which to use IPI to free resources
  */
+/* after this # consecutive 'plugged' timeouts, use IPI to release resources */
 #define PLUGSB4RESET 100
-#define TIMEOUTSB4RESET 100
+/* after this many consecutive timeouts, use IPI to release resources */
+#define TIMEOUTSB4RESET 1
+/* at this number uses of IPI to release resources, giveup the request */
+#define IPI_RESET_LIMIT 1
+/* after this # consecutive successes, bump up the throttle if it was lowered */
+#define COMPLETE_THRESHOLD 5
 
 /*
  * number of entries in the destination side payload queue
@@ ... @@
 #define FLUSH_RETRY_TIMEOUT 2
 #define FLUSH_GIVEUP 3
 #define FLUSH_COMPLETE 4
+
+/*
+ * tuning the action when the numalink network is extremely delayed
+ */
+#define CONGESTED_RESPONSE_US 1000 /* 'long' response time, in microseconds */
+#define CONGESTED_REPS 10 /* long delays averaged over this many broadcasts */
+#define CONGESTED_PERIOD 30 /* time for the bau to be disabled, in seconds */
 
 /*
  * Distribution: 32 bytes (256 bits) (bytes 0-0x1f of descriptor)
@@ ... @@
 	struct bau_control *uvhub_master;
 	struct bau_control *socket_master;
 	unsigned long timeout_interval;
+	unsigned long set_bau_on_time;
 	atomic_t active_descriptor_count;
-	int max_concurrent;
-	int max_concurrent_constant;
-	int retry_message_scans;
 	int plugged_tries;
 	int timeout_tries;
 	int ipi_attempts;
 	int conseccompletes;
+	int set_bau_off;
 	short cpu;
 	short uvhub_cpu;
 	short uvhub;
@@ ... @@
 	spinlock_t masks_lock;
 	spinlock_t uvhub_lock;
 	spinlock_t queue_lock;
+	/* tunables */
+	int max_bau_concurrent;
+	int max_bau_concurrent_constant;
+	int plugged_delay;
+	int plugsb4reset;
+	int timeoutsb4reset;
+	int ipi_reset_limit;
+	int complete_threshold;
+	int congested_response_us;
+	int congested_reps;
+	int congested_period;
+	cycles_t period_time;
+	long period_requests;
 };
 
 /*
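
The max_bau_concurrent/max_bau_concurrent_constant pair implements an
adaptive throttle: a destination timeout drops the per-uvhub concurrency
to 1, and a run of more than complete_threshold consecutive successful
broadcasts raises it again, one step at a time, back toward the
configured ceiling. A simplified standalone sketch of that policy
(illustration only; the real code in tlb_uv.c below does this per hub,
under the uvhub lock):

    #include <stdio.h>

    #define MAX_BAU_CONCURRENT 3    /* stands in for the tunable ceiling */
    #define COMPLETE_THRESHOLD 5

    static int max_bau_concurrent = MAX_BAU_CONCURRENT;
    static int conseccompletes;

    static void broadcast_done(int timed_out)
    {
            if (timed_out) {
                    max_bau_concurrent = 1; /* throttle hard on a timeout */
                    conseccompletes = 0;
                    return;
            }
            if (++conseccompletes > COMPLETE_THRESHOLD &&
                max_bau_concurrent < MAX_BAU_CONCURRENT)
                    max_bau_concurrent++;   /* recover one step at a time */
    }

    int main(void)
    {
            int i;

            broadcast_done(1);              /* one timeout ... */
            for (i = 0; i < 12; i++) {      /* ... then a run of successes */
                    broadcast_done(0);
                    printf("success %2d: concurrency %d\n",
                           i + 1, max_bau_concurrent);
            }
            return 0;
    }
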
--- a/arch/x86/kernel/tlb_uv.c
+++ b/arch/x86/kernel/tlb_uv.c
@@ ... @@
  */
 #include <linux/seq_file.h>
 #include <linux/proc_fs.h>
+#include <linux/debugfs.h>
 #include <linux/kernel.h>
 #include <linux/slab.h>
 
@@ ... @@
 	167772160
 };
 static int timeout_us;
-
-#define UV_INTD_SOFT_ACK_TIMEOUT_PERIOD 0x000000000bUL
-
-static int uv_bau_max_concurrent __read_mostly;
-
 static int nobau;
+
+/* tunables: */
+static int max_bau_concurrent = MAX_BAU_CONCURRENT;
+static int max_bau_concurrent_constant = MAX_BAU_CONCURRENT;
+static int plugged_delay = PLUGGED_DELAY;
+static int plugsb4reset = PLUGSB4RESET;
+static int timeoutsb4reset = TIMEOUTSB4RESET;
+static int ipi_reset_limit = IPI_RESET_LIMIT;
+static int complete_threshold = COMPLETE_THRESHOLD;
+static int congested_response_us = CONGESTED_RESPONSE_US;
+static int congested_reps = CONGESTED_REPS;
+static int congested_period = CONGESTED_PERIOD;
+static struct dentry *tunables_dir;
+static struct dentry *tunables_file;
+
 static int __init setup_nobau(char *arg)
 {
 	nobau = 1;
@@ ... @@
 	unsigned long index;
 	cycles_t time1;
 	cycles_t time2;
+	cycles_t elapsed;
 	struct ptc_stats *stat = &per_cpu(ptcstats, bcp->cpu);
 	struct bau_control *smaster = bcp->socket_master;
 	struct bau_control *hmaster = bcp->uvhub_master;
 
 	/*
-	 * Spin here while there are hmaster->max_concurrent or more active
+	 * Spin here while there are hmaster->max_bau_concurrent or more active
 	 * descriptors. This is the per-uvhub 'throttle'.
 	 */
 	if (!atomic_inc_unless_ge(&hmaster->uvhub_lock,
 			&hmaster->active_descriptor_count,
-			hmaster->max_concurrent)) {
+			hmaster->max_bau_concurrent)) {
 		stat->s_throttles++;
 		do {
 			cpu_relax();
 		} while (!atomic_inc_unless_ge(&hmaster->uvhub_lock,
 			&hmaster->active_descriptor_count,
-			hmaster->max_concurrent));
+			hmaster->max_bau_concurrent));
 	}
 
 	while (hmaster->uvhub_quiesce)
@@ ... @@
 			 * that case hardware immediately returns the ERROR
 			 * that looks like a destination timeout.
 			 */
-			udelay(TIMEOUT_DELAY);
+			udelay(bcp->plugged_delay);
 			bcp->plugged_tries++;
-			if (bcp->plugged_tries >= PLUGSB4RESET) {
+			if (bcp->plugged_tries >= bcp->plugsb4reset) {
 				bcp->plugged_tries = 0;
 				quiesce_local_uvhub(hmaster);
 				spin_lock(&hmaster->queue_lock);
@@ ... @@
 				stat->s_resets_plug++;
 			}
 		} else if (completion_status == FLUSH_RETRY_TIMEOUT) {
-			hmaster->max_concurrent = 1;
+			hmaster->max_bau_concurrent = 1;
 			bcp->timeout_tries++;
 			udelay(TIMEOUT_DELAY);
-			if (bcp->timeout_tries >= TIMEOUTSB4RESET) {
+			if (bcp->timeout_tries >= bcp->timeoutsb4reset) {
 				bcp->timeout_tries = 0;
 				quiesce_local_uvhub(hmaster);
 				spin_lock(&hmaster->queue_lock);
@@ ... @@
 				stat->s_resets_timeout++;
 			}
 		}
-		if (bcp->ipi_attempts >= 3) {
+		if (bcp->ipi_attempts >= bcp->ipi_reset_limit) {
 			bcp->ipi_attempts = 0;
 			completion_status = FLUSH_GIVEUP;
 			break;
@@ ... @@
 		 (completion_status == FLUSH_RETRY_TIMEOUT));
 	time2 = get_cycles();
 
-	if ((completion_status == FLUSH_COMPLETE) && (bcp->conseccompletes > 5)
-	    && (hmaster->max_concurrent < hmaster->max_concurrent_constant))
-		hmaster->max_concurrent++;
+	bcp->plugged_tries = 0;
+	bcp->timeout_tries = 0;
+
+	if ((completion_status == FLUSH_COMPLETE) &&
+	    (bcp->conseccompletes > bcp->complete_threshold) &&
+	    (hmaster->max_bau_concurrent <
+	     hmaster->max_bau_concurrent_constant))
+		hmaster->max_bau_concurrent++;
 
 	/*
 	 * hold any cpu not timing out here; no other cpu currently held by
@@ ... @@
 	atomic_dec(&hmaster->active_descriptor_count);
 
 	/* guard against cycles wrap */
-	if (time2 > time1)
-		stat->s_time += (time2 - time1);
-	else
+	if (time2 > time1) {
+		elapsed = time2 - time1;
+		stat->s_time += elapsed;
+	} else
 		stat->s_requestor--; /* don't count this one */
 	if (completion_status == FLUSH_COMPLETE && try > 1)
 		stat->s_retriesok++;
@@ ... @@
 	struct ptc_stats *stat;
 	struct bau_control *bcp;
 
+	/* kernel was booted 'nobau' */
 	if (nobau)
 		return cpumask;
 
 	bcp = &per_cpu(bau_control, cpu);
+
 	/*
 	 * Each sending cpu has a per-cpu mask which it fills from the caller's
 	 * cpu mask. Only remote cpus are converted to uvhubs and copied.
@@ ... @@
 		stat->s_resets_plug, stat->s_resets_timeout,
 		stat->s_giveup, stat->s_stimeout,
 		stat->s_busy, stat->s_throttles);
+
 	/* destination side statistics */
 	seq_printf(file,
 		"%lx %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld\n",
@@ ... @@
 }
 
 /*
+ * Display the tunables thru debugfs
+ */
+static ssize_t tunables_read(struct file *file, char __user *userbuf,
+				size_t count, loff_t *ppos)
+{
+	char buf[300];
+	int ret;
+
+	ret = snprintf(buf, 300, "%s %s %s\n%d %d %d %d %d %d %d %d %d\n",
+		"max_bau_concurrent plugged_delay plugsb4reset",
+		"timeoutsb4reset ipi_reset_limit complete_threshold",
+		"congested_response_us congested_reps congested_period",
+		max_bau_concurrent, plugged_delay, plugsb4reset,
+		timeoutsb4reset, ipi_reset_limit, complete_threshold,
+		congested_response_us, congested_reps, congested_period);
+
+	return simple_read_from_buffer(userbuf, count, ppos, buf, ret);
+}
+
+/*
  * -1: resetf the statistics
  *  0: display meaning of the statistics
- * >0: maximum concurrent active descriptors per uvhub (throttle)
  */
 static ssize_t uv_ptc_proc_write(struct file *file, const char __user *user,
 				size_t count, loff_t *data)
@@ ... @@
 	long input_arg;
 	char optstr[64];
 	struct ptc_stats *stat;
-	struct bau_control *bcp;
 
 	if (count == 0 || count > sizeof(optstr))
 		return -EINVAL;
@@ ... @@
 			stat = &per_cpu(ptcstats, cpu);
 			memset(stat, 0, sizeof(struct ptc_stats));
 		}
-	} else {
-		uv_bau_max_concurrent = input_arg;
-		bcp = &per_cpu(bau_control, smp_processor_id());
-		if (uv_bau_max_concurrent < 1 ||
-		    uv_bau_max_concurrent > bcp->cpus_in_uvhub) {
-			printk(KERN_DEBUG
-				"Error: BAU max concurrent %d; %d is invalid\n",
-				bcp->max_concurrent, uv_bau_max_concurrent);
-			return -EINVAL;
-		}
-		printk(KERN_DEBUG "Set BAU max concurrent:%d\n",
-			uv_bau_max_concurrent);
-		for_each_present_cpu(cpu) {
-			bcp = &per_cpu(bau_control, cpu);
-			bcp->max_concurrent = uv_bau_max_concurrent;
-		}
 	}
 
+	return count;
+}
+
+static int local_atoi(const char *name)
+{
+	int val = 0;
+
+	for (;; name++) {
+		switch (*name) {
+		case '0' ... '9':
+			val = 10*val+(*name-'0');
+			break;
+		default:
+			return val;
+		}
+	}
+}
+
+/*
+ * set the tunables
+ * 0 values reset them to defaults
+ */
+static ssize_t tunables_write(struct file *file, const char __user *user,
+				size_t count, loff_t *data)
+{
+	int cpu;
+	int cnt = 0;
+	int val;
+	char *p;
+	char *q;
+	char instr[64];
+	struct bau_control *bcp;
+
+	if (count == 0 || count > sizeof(instr)-1)
+		return -EINVAL;
+	if (copy_from_user(instr, user, count))
+		return -EFAULT;
+
+	instr[count] = '\0';
+	/* count the fields */
+	p = instr + strspn(instr, WHITESPACE);
+	q = p;
+	for (; *p; p = q + strspn(q, WHITESPACE)) {
+		q = p + strcspn(p, WHITESPACE);
+		cnt++;
+		if (q == p)
+			break;
+	}
+	if (cnt != 9) {
+		printk(KERN_INFO "bau tunable error: should be 9 numbers\n");
+		return -EINVAL;
+	}
+
+	p = instr + strspn(instr, WHITESPACE);
+	q = p;
+	for (cnt = 0; *p; p = q + strspn(q, WHITESPACE), cnt++) {
+		q = p + strcspn(p, WHITESPACE);
+		val = local_atoi(p);
+		switch (cnt) {
+		case 0:
+			if (val == 0) {
+				max_bau_concurrent = MAX_BAU_CONCURRENT;
+				max_bau_concurrent_constant =
+					MAX_BAU_CONCURRENT;
+				continue;
+			}
+			bcp = &per_cpu(bau_control, smp_processor_id());
+			if (val < 1 || val > bcp->cpus_in_uvhub) {
+				printk(KERN_DEBUG
+				"Error: BAU max concurrent %d is invalid\n",
+					val);
+				return -EINVAL;
+			}
+			max_bau_concurrent = val;
+			max_bau_concurrent_constant = val;
+			continue;
+		case 1:
+			if (val == 0)
+				plugged_delay = PLUGGED_DELAY;
+			else
+				plugged_delay = val;
+			continue;
+		case 2:
+			if (val == 0)
+				plugsb4reset = PLUGSB4RESET;
+			else
+				plugsb4reset = val;
+			continue;
+		case 3:
+			if (val == 0)
+				timeoutsb4reset = TIMEOUTSB4RESET;
+			else
+				timeoutsb4reset = val;
+			continue;
+		case 4:
+			if (val == 0)
+				ipi_reset_limit = IPI_RESET_LIMIT;
+			else
+				ipi_reset_limit = val;
+			continue;
+		case 5:
+			if (val == 0)
+				complete_threshold = COMPLETE_THRESHOLD;
+			else
+				complete_threshold = val;
+			continue;
+		case 6:
+			if (val == 0)
+				congested_response_us = CONGESTED_RESPONSE_US;
+			else
+				congested_response_us = val;
+			continue;
+		case 7:
+			if (val == 0)
+				congested_reps = CONGESTED_REPS;
+			else
+				congested_reps = val;
+			continue;
+		case 8:
+			if (val == 0)
+				congested_period = CONGESTED_PERIOD;
+			else
+				congested_period = val;
+			continue;
+		}
+		if (q == p)
+			break;
+	}
+	for_each_present_cpu(cpu) {
+		bcp = &per_cpu(bau_control, cpu);
+		bcp->max_bau_concurrent = max_bau_concurrent;
+		bcp->max_bau_concurrent_constant = max_bau_concurrent;
+		bcp->plugged_delay = plugged_delay;
+		bcp->plugsb4reset = plugsb4reset;
+		bcp->timeoutsb4reset = timeoutsb4reset;
+		bcp->ipi_reset_limit = ipi_reset_limit;
+		bcp->complete_threshold = complete_threshold;
+		bcp->congested_response_us = congested_response_us;
+		bcp->congested_reps = congested_reps;
+		bcp->congested_period = congested_period;
+	}
 	return count;
 }
 
@@ ... @@
 	return seq_open(file, &uv_ptc_seq_ops);
 }
 
+static int tunables_open(struct inode *inode, struct file *file)
+{
+	return 0;
+}
+
 static const struct file_operations proc_uv_ptc_operations = {
 	.open = uv_ptc_proc_open,
 	.read = seq_read,
 	.write = uv_ptc_proc_write,
 	.llseek = seq_lseek,
 	.release = seq_release,
+};
+
+static const struct file_operations tunables_fops = {
+	.open = tunables_open,
+	.read = tunables_read,
+	.write = tunables_write,
 };
 
 static int __init uv_ptc_init(void)
@@ ... @@
 	if (!proc_uv_ptc) {
 		printk(KERN_ERR "unable to create %s proc entry\n",
 		       UV_PTC_BASENAME);
+		return -EINVAL;
+	}
+
+	tunables_dir = debugfs_create_dir(UV_BAU_TUNABLES_DIR, NULL);
+	if (!tunables_dir) {
+		printk(KERN_ERR "unable to create debugfs directory %s\n",
+		       UV_BAU_TUNABLES_DIR);
+		return -EINVAL;
+	}
+	tunables_file = debugfs_create_file(UV_BAU_TUNABLES_FILE, 0600,
+			tunables_dir, NULL, &tunables_fops);
+	if (!tunables_file) {
+		printk(KERN_ERR "unable to create debugfs file %s\n",
+		       UV_BAU_TUNABLES_FILE);
 		return -EINVAL;
 	}
 	return 0;
@@ ... @@
 		bcp = &per_cpu(bau_control, cpu);
 		memset(bcp, 0, sizeof(struct bau_control));
 		spin_lock_init(&bcp->masks_lock);
-		bcp->max_concurrent = uv_bau_max_concurrent;
 		pnode = uv_cpu_hub_info(cpu)->pnode;
 		uvhub = uv_cpu_hub_info(cpu)->numa_blade_id;
 		bdp = &uvhub_descs[uvhub];
 		bdp->num_cpus++;
 		bdp->uvhub = uvhub;
 		bdp->pnode = pnode;
-		/* time interval to catch a hardware stay-busy bug */
-		bcp->timeout_interval = microsec_2_cycles(2*timeout_us);
 		/* kludge: assume uv_hub.h is constant */
 		socket = (cpu_physical_id(cpu)>>5)&1;
 		if (socket >= bdp->num_sockets)
@@ ... @@
 		}
 	}
 	kfree(uvhub_descs);
+	for_each_present_cpu(cpu) {
+		bcp = &per_cpu(bau_control, cpu);
+		/* time interval to catch a hardware stay-busy bug */
+		bcp->timeout_interval = microsec_2_cycles(2*timeout_us);
+		bcp->max_bau_concurrent = max_bau_concurrent;
+		bcp->max_bau_concurrent_constant = max_bau_concurrent;
+		bcp->plugged_delay = plugged_delay;
+		bcp->plugsb4reset = plugsb4reset;
+		bcp->timeoutsb4reset = timeoutsb4reset;
+		bcp->ipi_reset_limit = ipi_reset_limit;
+		bcp->complete_threshold = complete_threshold;
+		bcp->congested_response_us = congested_response_us;
+		bcp->congested_reps = congested_reps;
+		bcp->congested_period = congested_period;
+	}
 }
 
 /*
@@ ... @@
 	zalloc_cpumask_var_node(&per_cpu(uv_flush_tlb_mask, cur_cpu),
 				GFP_KERNEL, cpu_to_node(cur_cpu));
 
-	uv_bau_max_concurrent = MAX_BAU_CONCURRENT;
+	max_bau_concurrent = MAX_BAU_CONCURRENT;
 	uv_nshift = uv_hub_info->m_val;
 	uv_mmask = (1UL << uv_hub_info->m_val) - 1;
 	nuvhubs = uv_num_possible_blades();
@@ ... @@
 	return 0;
 }
 core_initcall(uv_bau_init);
-core_initcall(uv_ptc_init);
+fs_initcall(uv_ptc_init);
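
tunables_write() validates its input by first counting the
whitespace-separated fields with strspn()/strcspn() and insisting on
exactly nine, then converting each field with local_atoi(), which stops
at the first non-digit. The same parsing scheme can be exercised in a
small userspace harness (illustration only; the sample values below are
arbitrary):

    #include <stdio.h>
    #include <string.h>

    #define WHITESPACE " \t\n"

    /* same digit-only converter as the kernel-side local_atoi() */
    static int local_atoi(const char *name)
    {
            int val = 0;

            for (;; name++) {
                    switch (*name) {
                    case '0' ... '9':       /* gcc case-range extension */
                            val = 10*val + (*name - '0');
                            break;
                    default:
                            return val;
                    }
            }
    }

    int main(void)
    {
            char instr[] = " 16 10\t100 1 1 5 1000 10 30\n";
            char *p, *q;
            int cnt = 0;

            /* walk the fields exactly as tunables_write() does */
            p = instr + strspn(instr, WHITESPACE);
            for (q = p; *p; p = q + strspn(q, WHITESPACE)) {
                    q = p + strcspn(p, WHITESPACE);
                    printf("field %d = %d\n", cnt++, local_atoi(p));
                    if (q == p)
                            break;
            }
            return 0;
    }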