Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

[SCSI] Reduce error recovery time by reducing use of TURs

In error recovery, most scsi error recovery stages will send a TUR command
for every bad command when a driver's error handler reports success. When
there are several bad commands to the same device, this results in the
device being probed multiple times.

This becomes very problematic if the device or connection is in a state
where the device still doesn't respond to commands even after a recovery
function returns success. The error handler must wait for the test
commands to time out. The time waiting for the redundant commands can
drastically lengthen error recovery.

This patch alters the scsi mid-layer's error routines to send test commands
once per device instead of once per bad command. This can drastically
lower error recovery time.

[jejb: fixed up whitespace and formatting]
Signed-off-by: David Jeffery <djeffery@redhat.com>
Signed-off-by: James Bottomley <jbottomley@parallels.com>

authored by

David Jeffery and committed by
James Bottomley
3eef6257 0bcaa111

+67 -20
+67 -20
drivers/scsi/scsi_error.c
··· 50 50 #define BUS_RESET_SETTLE_TIME (10) 51 51 #define HOST_RESET_SETTLE_TIME (10) 52 52 53 + static int scsi_eh_try_stu(struct scsi_cmnd *scmd); 54 + 53 55 /* called with shost->host_lock held */ 54 56 void scsi_eh_wakeup(struct Scsi_Host *shost) 55 57 { ··· 949 947 } 950 948 951 949 /** 950 + * scsi_eh_test_devices - check if devices are responding from error recovery. 951 + * @cmd_list: scsi commands in error recovery. 952 + * @work_q: queue for commands which still need more error recovery 953 + * @done_q: queue for commands which are finished 954 + * @try_stu: boolean on if a STU command should be tried in addition to TUR. 955 + * 956 + * Decription: 957 + * Tests if devices are in a working state. Commands to devices now in 958 + * a working state are sent to the done_q while commands to devices which 959 + * are still failing to respond are returned to the work_q for more 960 + * processing. 961 + **/ 962 + static int scsi_eh_test_devices(struct list_head *cmd_list, 963 + struct list_head *work_q, 964 + struct list_head *done_q, int try_stu) 965 + { 966 + struct scsi_cmnd *scmd, *next; 967 + struct scsi_device *sdev; 968 + int finish_cmds; 969 + 970 + while (!list_empty(cmd_list)) { 971 + scmd = list_entry(cmd_list->next, struct scsi_cmnd, eh_entry); 972 + sdev = scmd->device; 973 + 974 + finish_cmds = !scsi_device_online(scmd->device) || 975 + (try_stu && !scsi_eh_try_stu(scmd) && 976 + !scsi_eh_tur(scmd)) || 977 + !scsi_eh_tur(scmd); 978 + 979 + list_for_each_entry_safe(scmd, next, cmd_list, eh_entry) 980 + if (scmd->device == sdev) { 981 + if (finish_cmds) 982 + scsi_eh_finish_cmd(scmd, done_q); 983 + else 984 + list_move_tail(&scmd->eh_entry, work_q); 985 + } 986 + } 987 + return list_empty(work_q); 988 + } 989 + 990 + 991 + /** 952 992 * scsi_eh_abort_cmds - abort pending commands. 953 993 * @work_q: &list_head for pending commands. 954 994 * @done_q: &list_head for processed commands. 
··· 1006 962 struct list_head *done_q) 1007 963 { 1008 964 struct scsi_cmnd *scmd, *next; 965 + LIST_HEAD(check_list); 1009 966 int rtn; 1010 967 1011 968 list_for_each_entry_safe(scmd, next, work_q, eh_entry) { ··· 1018 973 rtn = scsi_try_to_abort_cmd(scmd->device->host->hostt, scmd); 1019 974 if (rtn == SUCCESS || rtn == FAST_IO_FAIL) { 1020 975 scmd->eh_eflags &= ~SCSI_EH_CANCEL_CMD; 1021 - if (!scsi_device_online(scmd->device) || 1022 - rtn == FAST_IO_FAIL || 1023 - !scsi_eh_tur(scmd)) { 976 + if (rtn == FAST_IO_FAIL) 1024 977 scsi_eh_finish_cmd(scmd, done_q); 1025 - } 978 + else 979 + list_move_tail(&scmd->eh_entry, &check_list); 1026 980 } else 1027 981 SCSI_LOG_ERROR_RECOVERY(3, printk("%s: aborting" 1028 982 " cmd failed:" ··· 1030 986 scmd)); 1031 987 } 1032 988 1033 - return list_empty(work_q); 989 + return scsi_eh_test_devices(&check_list, work_q, done_q, 0); 1034 990 } 1035 991 1036 992 /** ··· 1181 1137 struct list_head *done_q) 1182 1138 { 1183 1139 LIST_HEAD(tmp_list); 1140 + LIST_HEAD(check_list); 1184 1141 1185 1142 list_splice_init(work_q, &tmp_list); 1186 1143 ··· 1206 1161 if (scmd_id(scmd) != id) 1207 1162 continue; 1208 1163 1209 - if ((rtn == SUCCESS || rtn == FAST_IO_FAIL) 1210 - && (!scsi_device_online(scmd->device) || 1211 - rtn == FAST_IO_FAIL || !scsi_eh_tur(scmd))) 1164 + if (rtn == SUCCESS) 1165 + list_move_tail(&scmd->eh_entry, &check_list); 1166 + else if (rtn == FAST_IO_FAIL) 1212 1167 scsi_eh_finish_cmd(scmd, done_q); 1213 1168 else 1214 1169 /* push back on work queue for further processing */ ··· 1216 1171 } 1217 1172 } 1218 1173 1219 - return list_empty(work_q); 1174 + return scsi_eh_test_devices(&check_list, work_q, done_q, 0); 1220 1175 } 1221 1176 1222 1177 /** ··· 1230 1185 struct list_head *done_q) 1231 1186 { 1232 1187 struct scsi_cmnd *scmd, *chan_scmd, *next; 1188 + LIST_HEAD(check_list); 1233 1189 unsigned int channel; 1234 1190 int rtn; 1235 1191 ··· 1262 1216 rtn = scsi_try_bus_reset(chan_scmd); 1263 1217 if (rtn == 
SUCCESS || rtn == FAST_IO_FAIL) { 1264 1218 list_for_each_entry_safe(scmd, next, work_q, eh_entry) { 1265 - if (channel == scmd_channel(scmd)) 1266 - if (!scsi_device_online(scmd->device) || 1267 - rtn == FAST_IO_FAIL || 1268 - !scsi_eh_tur(scmd)) 1219 + if (channel == scmd_channel(scmd)) { 1220 + if (rtn == FAST_IO_FAIL) 1269 1221 scsi_eh_finish_cmd(scmd, 1270 1222 done_q); 1223 + else 1224 + list_move_tail(&scmd->eh_entry, 1225 + &check_list); 1226 + } 1271 1227 } 1272 1228 } else { 1273 1229 SCSI_LOG_ERROR_RECOVERY(3, printk("%s: BRST" ··· 1278 1230 channel)); 1279 1231 } 1280 1232 } 1281 - return list_empty(work_q); 1233 + return scsi_eh_test_devices(&check_list, work_q, done_q, 0); 1282 1234 } 1283 1235 1284 1236 /** ··· 1290 1242 struct list_head *done_q) 1291 1243 { 1292 1244 struct scsi_cmnd *scmd, *next; 1245 + LIST_HEAD(check_list); 1293 1246 int rtn; 1294 1247 1295 1248 if (!list_empty(work_q)) { ··· 1301 1252 , current->comm)); 1302 1253 1303 1254 rtn = scsi_try_host_reset(scmd); 1304 - if (rtn == SUCCESS || rtn == FAST_IO_FAIL) { 1255 + if (rtn == SUCCESS) { 1256 + list_splice_init(work_q, &check_list); 1257 + } else if (rtn == FAST_IO_FAIL) { 1305 1258 list_for_each_entry_safe(scmd, next, work_q, eh_entry) { 1306 - if (!scsi_device_online(scmd->device) || 1307 - rtn == FAST_IO_FAIL || 1308 - (!scsi_eh_try_stu(scmd) && !scsi_eh_tur(scmd)) || 1309 - !scsi_eh_tur(scmd)) 1310 1259 scsi_eh_finish_cmd(scmd, done_q); 1311 1260 } 1312 1261 } else { ··· 1313 1266 current->comm)); 1314 1267 } 1315 1268 } 1316 - return list_empty(work_q); 1269 + return scsi_eh_test_devices(&check_list, work_q, done_q, 1); 1317 1270 } 1318 1271 1319 1272 /**