Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

scsi: target: Fix ordered tag handling

This patch fixes the following bugs:

1. If there are multiple ordered cmds queued and multiple simple cmds
completing, target_restart_delayed_cmds() could be called on different
CPUs and each instance could start an ordered cmd. They could then run in
different orders than they were queued.

2. target_restart_delayed_cmds() and target_handle_task_attr() can race
where:

1. target_handle_task_attr() has passed the simple_cmds == 0 check.

2. transport_complete_task_attr() then decrements simple_cmds to 0.

3. transport_complete_task_attr() runs target_restart_delayed_cmds() and
it does not see any cmds on the delayed_cmd_list.

4. target_handle_task_attr() adds the cmd to the delayed_cmd_list.

The cmd will then end up timing out.

3. If we are sent > 1 ordered cmds and simple_cmds == 0, we can execute
them out of order, because target_handle_task_attr() will hit that
simple_cmds check first and return false for all ordered cmds sent.

4. We run target_restart_delayed_cmds() after every cmd completion, so if
there is more than 1 simple cmd running, we start executing ordered cmds
after that first cmd instead of waiting for all of them to complete.

5. Ordered cmds are not supposed to start until HEAD OF QUEUE and all older
cmds have completed, not just SIMPLE cmds.

6. It's not a bug but it doesn't make sense to take the delayed_cmd_lock
for every cmd completion when ordered cmds are almost never used. Just
replacing that lock with an atomic increases IOPs by up to 10% when
completions are spread over multiple CPUs and there are multiple
sessions/mqs/threads accessing the same device.

This patch moves the queued delayed handling to a per device work to
serialize the cmd executions for each device and adds a new counter to track
HEAD_OF_QUEUE and SIMPLE cmds. We can then check the new counter to
determine when to run the work on the completion path.

Link: https://lore.kernel.org/r/20210930020422.92578-3-michael.christie@oracle.com
Signed-off-by: Mike Christie <michael.christie@oracle.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>

authored by

Mike Christie and committed by
Martin K. Petersen
ed1227e0 945a1607

+61 -24
+2
drivers/target/target_core_device.c
··· 772 772 INIT_LIST_HEAD(&dev->t10_alua.lba_map_list); 773 773 spin_lock_init(&dev->t10_alua.lba_map_lock); 774 774 775 + INIT_WORK(&dev->delayed_cmd_work, target_do_delayed_work); 776 + 775 777 dev->t10_wwn.t10_dev = dev; 776 778 /* 777 779 * Use OpenFabrics IEEE Company ID: 00 14 05
+1
drivers/target/target_core_internal.h
··· 151 151 void transport_clear_lun_ref(struct se_lun *); 152 152 sense_reason_t target_cmd_size_check(struct se_cmd *cmd, unsigned int size); 153 153 void target_qf_do_work(struct work_struct *work); 154 + void target_do_delayed_work(struct work_struct *work); 154 155 bool target_check_wce(struct se_device *dev); 155 156 bool target_check_fua(struct se_device *dev); 156 157 void __target_execute_cmd(struct se_cmd *, bool);
+54 -22
drivers/target/target_core_transport.c
··· 2173 2173 */ 2174 2174 switch (cmd->sam_task_attr) { 2175 2175 case TCM_HEAD_TAG: 2176 + atomic_inc_mb(&dev->non_ordered); 2176 2177 pr_debug("Added HEAD_OF_QUEUE for CDB: 0x%02x\n", 2177 2178 cmd->t_task_cdb[0]); 2178 2179 return false; 2179 2180 case TCM_ORDERED_TAG: 2180 - atomic_inc_mb(&dev->dev_ordered_sync); 2181 + atomic_inc_mb(&dev->delayed_cmd_count); 2181 2182 2182 2183 pr_debug("Added ORDERED for CDB: 0x%02x to ordered list\n", 2183 2184 cmd->t_task_cdb[0]); 2184 - 2185 - /* 2186 - * Execute an ORDERED command if no other older commands 2187 - * exist that need to be completed first. 2188 - */ 2189 - if (!atomic_read(&dev->simple_cmds)) 2190 - return false; 2191 2185 break; 2192 2186 default: 2193 2187 /* 2194 2188 * For SIMPLE and UNTAGGED Task Attribute commands 2195 2189 */ 2196 - atomic_inc_mb(&dev->simple_cmds); 2190 + atomic_inc_mb(&dev->non_ordered); 2191 + 2192 + if (atomic_read(&dev->delayed_cmd_count) == 0) 2193 + return false; 2197 2194 break; 2198 2195 } 2199 2196 2200 - if (atomic_read(&dev->dev_ordered_sync) == 0) 2201 - return false; 2197 + if (cmd->sam_task_attr != TCM_ORDERED_TAG) { 2198 + atomic_inc_mb(&dev->delayed_cmd_count); 2199 + /* 2200 + * We will account for this when we dequeue from the delayed 2201 + * list. 2202 + */ 2203 + atomic_dec_mb(&dev->non_ordered); 2204 + } 2202 2205 2203 2206 spin_lock_irq(&cmd->t_state_lock); 2204 2207 cmd->transport_state &= ~CMD_T_SENT; ··· 2213 2210 2214 2211 pr_debug("Added CDB: 0x%02x Task Attr: 0x%02x to delayed CMD listn", 2215 2212 cmd->t_task_cdb[0], cmd->sam_task_attr); 2213 + /* 2214 + * We may have no non ordered cmds when this function started or we 2215 + * could have raced with the last simple/head cmd completing, so kick 2216 + * the delayed handler here. 
2217 + */ 2218 + schedule_work(&dev->delayed_cmd_work); 2216 2219 return true; 2217 2220 } 2218 2221 ··· 2252 2243 * Process all commands up to the last received ORDERED task attribute which 2253 2244 * requires another blocking boundary 2254 2245 */ 2255 - static void target_restart_delayed_cmds(struct se_device *dev) 2246 + void target_do_delayed_work(struct work_struct *work) 2256 2247 { 2257 - for (;;) { 2248 + struct se_device *dev = container_of(work, struct se_device, 2249 + delayed_cmd_work); 2250 + 2251 + spin_lock(&dev->delayed_cmd_lock); 2252 + while (!dev->ordered_sync_in_progress) { 2258 2253 struct se_cmd *cmd; 2259 2254 2260 - spin_lock(&dev->delayed_cmd_lock); 2261 - if (list_empty(&dev->delayed_cmd_list)) { 2262 - spin_unlock(&dev->delayed_cmd_lock); 2255 + if (list_empty(&dev->delayed_cmd_list)) 2263 2256 break; 2264 - } 2265 2257 2266 2258 cmd = list_entry(dev->delayed_cmd_list.next, 2267 2259 struct se_cmd, se_delayed_node); 2260 + 2261 + if (cmd->sam_task_attr == TCM_ORDERED_TAG) { 2262 + /* 2263 + * Check if we started with: 2264 + * [ordered] [simple] [ordered] 2265 + * and we are now at the last ordered so we have to wait 2266 + * for the simple cmd. 
2267 + */ 2268 + if (atomic_read(&dev->non_ordered) > 0) 2269 + break; 2270 + 2271 + dev->ordered_sync_in_progress = true; 2272 + } 2273 + 2268 2274 list_del(&cmd->se_delayed_node); 2275 + atomic_dec_mb(&dev->delayed_cmd_count); 2269 2276 spin_unlock(&dev->delayed_cmd_lock); 2277 + 2278 + if (cmd->sam_task_attr != TCM_ORDERED_TAG) 2279 + atomic_inc_mb(&dev->non_ordered); 2270 2280 2271 2281 cmd->transport_state |= CMD_T_SENT; 2272 2282 2273 2283 __target_execute_cmd(cmd, true); 2274 2284 2275 - if (cmd->sam_task_attr == TCM_ORDERED_TAG) 2276 - break; 2285 + spin_lock(&dev->delayed_cmd_lock); 2277 2286 } 2287 + spin_unlock(&dev->delayed_cmd_lock); 2278 2288 } 2279 2289 2280 2290 /* ··· 2311 2283 goto restart; 2312 2284 2313 2285 if (cmd->sam_task_attr == TCM_SIMPLE_TAG) { 2314 - atomic_dec_mb(&dev->simple_cmds); 2286 + atomic_dec_mb(&dev->non_ordered); 2315 2287 dev->dev_cur_ordered_id++; 2316 2288 } else if (cmd->sam_task_attr == TCM_HEAD_TAG) { 2289 + atomic_dec_mb(&dev->non_ordered); 2317 2290 dev->dev_cur_ordered_id++; 2318 2291 pr_debug("Incremented dev_cur_ordered_id: %u for HEAD_OF_QUEUE\n", 2319 2292 dev->dev_cur_ordered_id); 2320 2293 } else if (cmd->sam_task_attr == TCM_ORDERED_TAG) { 2321 - atomic_dec_mb(&dev->dev_ordered_sync); 2294 + spin_lock(&dev->delayed_cmd_lock); 2295 + dev->ordered_sync_in_progress = false; 2296 + spin_unlock(&dev->delayed_cmd_lock); 2322 2297 2323 2298 dev->dev_cur_ordered_id++; 2324 2299 pr_debug("Incremented dev_cur_ordered_id: %u for ORDERED\n", ··· 2330 2299 cmd->se_cmd_flags &= ~SCF_TASK_ATTR_SET; 2331 2300 2332 2301 restart: 2333 - target_restart_delayed_cmds(dev); 2302 + if (atomic_read(&dev->delayed_cmd_count) > 0) 2303 + schedule_work(&dev->delayed_cmd_work); 2334 2304 } 2335 2305 2336 2306 static void transport_complete_qf(struct se_cmd *cmd)
+4 -2
include/target/target_core_base.h
··· 812 812 atomic_long_t read_bytes; 813 813 atomic_long_t write_bytes; 814 814 /* Active commands on this virtual SE device */ 815 - atomic_t simple_cmds; 816 - atomic_t dev_ordered_sync; 815 + atomic_t non_ordered; 816 + bool ordered_sync_in_progress; 817 + atomic_t delayed_cmd_count; 817 818 atomic_t dev_qf_count; 818 819 u32 export_count; 819 820 spinlock_t delayed_cmd_lock; ··· 835 834 struct list_head dev_sep_list; 836 835 struct list_head dev_tmr_list; 837 836 struct work_struct qf_work_queue; 837 + struct work_struct delayed_cmd_work; 838 838 struct list_head delayed_cmd_list; 839 839 struct list_head qf_cmd_list; 840 840 /* Pointer to associated SE HBA */