Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

s390/pci: Fix cyclic dead-lock in zpci_zdev_put() and zpci_scan_devices()

When triggering PCI device recovery by writing into the SysFS attribute
`recover` of a Physical Function with existing child SR-IOV Virtual
Functions, lockdep is reporting a possible deadlock between three
threads:

Thread (A)                    Thread (B)                 Thread (C)
|                             |                          |
recover_store()               zpci_scan_devices()        zpci_scan_devices()
lock(pci_rescan_remove_lock)  |                          |
|                             |                          |
|                             |                          zpci_bus_scan_busses()
|                             |                          lock(zbus_list_lock)
|                             zpci_add_device()          |
|                             lock(zpci_add_remove_lock) |
|                             |                          ┴
|                             |                          zpci_bus_scan_bus()
|                             |                          lock(pci_rescan_remove_lock)
┴                             |
zpci_zdev_put()               |
lock(zpci_add_remove_lock)    |

                              zpci_bus_get()
                              lock(zbus_list_lock)

In zpci_bus_scan_busses() the `zbus_list_lock` is taken for the whole
duration of the function, which also includes taking
`pci_rescan_remove_lock`, among other things. But `zbus_list_lock` only
really needs to protect modifications of the global registration list
`zbus_list`; it can be dropped while the functions called within the
list iteration run. This way we break the cycle above.

Break up zpci_bus_scan_busses() into an "iterator" zpci_bus_get_next()
that iterates over `zbus_list` element by element, and acquires and
releases `zbus_list_lock` as necessary, but never keeps holding it.
References to `zpci_bus` objects are also acquired and released.

The reference counting on `zpci_bus` objects is also changed so that all
put() and get() operations are done under the protection of
`zbus_list_lock`, and if the operation results in a modification of
`zbus_list`, this modification is done in the same critical section
(apart from the very first initialization). This way, objects that are
about to be released and/or are half-initialized are never seen on the
list.

Fixes: 14c87ba8123a ("s390/pci: separate zbus registration from scanning")
Suggested-by: Niklas Schnelle <schnelle@linux.ibm.com>
Signed-off-by: Benjamin Block <bblock@linux.ibm.com>
Reviewed-by: Niklas Schnelle <schnelle@linux.ibm.com>
Reviewed-by: Gerd Bayer <gbayer@linux.ibm.com>
Signed-off-by: Heiko Carstens <hca@linux.ibm.com>

authored by

Benjamin Block and committed by
Heiko Carstens
4cb92fa7 b1aa01d3

+91 -29
+1
.clang-format
··· 748 748 - 'ynl_attr_for_each_nested' 749 749 - 'ynl_attr_for_each_payload' 750 750 - 'zorro_for_each_dev' 751 + - 'zpci_bus_for_each' 751 752 752 753 IncludeBlocks: Preserve 753 754 IncludeCategories:
+5 -1
arch/s390/pci/pci.c
··· 1148 1148 1149 1149 int zpci_scan_devices(void) 1150 1150 { 1151 + struct zpci_bus *zbus; 1151 1152 LIST_HEAD(scan_list); 1152 1153 int rc; 1153 1154 ··· 1157 1156 return rc; 1158 1157 1159 1158 zpci_add_devices(&scan_list); 1160 - zpci_bus_scan_busses(); 1159 + zpci_bus_for_each(zbus) { 1160 + zpci_bus_scan_bus(zbus); 1161 + cond_resched(); 1162 + } 1161 1163 return 0; 1162 1164 } 1163 1165
+71 -27
arch/s390/pci/pci_bus.c
··· 153 153 return ret; 154 154 } 155 155 156 - /* zpci_bus_scan_busses - Scan all registered busses 157 - * 158 - * Scan all available zbusses 159 - * 160 - */ 161 - void zpci_bus_scan_busses(void) 162 - { 163 - struct zpci_bus *zbus = NULL; 164 - 165 - mutex_lock(&zbus_list_lock); 166 - list_for_each_entry(zbus, &zbus_list, bus_next) { 167 - zpci_bus_scan_bus(zbus); 168 - cond_resched(); 169 - } 170 - mutex_unlock(&zbus_list_lock); 171 - } 172 - 173 156 static bool zpci_bus_is_multifunction_root(struct zpci_dev *zdev) 174 157 { 175 158 return !s390_pci_no_rid && zdev->rid_available && ··· 205 222 return -ENOMEM; 206 223 } 207 224 208 - static void zpci_bus_release(struct kref *kref) 225 + /** 226 + * zpci_bus_release - Un-initialize resources associated with the zbus and 227 + * free memory 228 + * @kref: refcount * that is part of struct zpci_bus 229 + * 230 + * MUST be called with `zbus_list_lock` held, but the lock is released during 231 + * run of the function. 232 + */ 233 + static inline void zpci_bus_release(struct kref *kref) 234 + __releases(&zbus_list_lock) 209 235 { 210 236 struct zpci_bus *zbus = container_of(kref, struct zpci_bus, kref); 237 + 238 + lockdep_assert_held(&zbus_list_lock); 239 + 240 + list_del(&zbus->bus_next); 241 + mutex_unlock(&zbus_list_lock); 242 + 243 + /* 244 + * At this point no-one should see this object, or be able to get a new 245 + * reference to it. 
246 + */ 211 247 212 248 if (zbus->bus) { 213 249 pci_lock_rescan_remove(); ··· 239 237 pci_unlock_rescan_remove(); 240 238 } 241 239 242 - mutex_lock(&zbus_list_lock); 243 - list_del(&zbus->bus_next); 244 - mutex_unlock(&zbus_list_lock); 245 240 zpci_remove_parent_msi_domain(zbus); 246 241 kfree(zbus); 247 242 } 248 243 249 - static void zpci_bus_put(struct zpci_bus *zbus) 244 + static inline void __zpci_bus_get(struct zpci_bus *zbus) 250 245 { 251 - kref_put(&zbus->kref, zpci_bus_release); 246 + lockdep_assert_held(&zbus_list_lock); 247 + kref_get(&zbus->kref); 248 + } 249 + 250 + static inline void zpci_bus_put(struct zpci_bus *zbus) 251 + { 252 + kref_put_mutex(&zbus->kref, zpci_bus_release, &zbus_list_lock); 252 253 } 253 254 254 255 static struct zpci_bus *zpci_bus_get(int topo, bool topo_is_tid) ··· 263 258 if (!zbus->multifunction) 264 259 continue; 265 260 if (topo_is_tid == zbus->topo_is_tid && topo == zbus->topo) { 266 - kref_get(&zbus->kref); 261 + __zpci_bus_get(zbus); 267 262 goto out_unlock; 268 263 } 269 264 } ··· 271 266 out_unlock: 272 267 mutex_unlock(&zbus_list_lock); 273 268 return zbus; 269 + } 270 + 271 + /** 272 + * zpci_bus_get_next - get the next zbus object from given position in the list 273 + * @pos: current position/cursor in the global zbus list 274 + * 275 + * Acquires and releases references as the cursor iterates (might also free/ 276 + * release the cursor). Is tolerant of concurrent operations on the list. 277 + * 278 + * To begin the iteration, set *@pos to %NULL before calling the function. 279 + * 280 + * *@pos is set to %NULL in cases where either the list is empty, or *@pos is 281 + * the last element in the list. 282 + * 283 + * Context: Process context. May sleep. 
284 + */ 285 + void zpci_bus_get_next(struct zpci_bus **pos) 286 + { 287 + struct zpci_bus *curp = *pos, *next = NULL; 288 + 289 + mutex_lock(&zbus_list_lock); 290 + if (curp) 291 + next = list_next_entry(curp, bus_next); 292 + else 293 + next = list_first_entry(&zbus_list, typeof(*curp), bus_next); 294 + 295 + if (list_entry_is_head(next, &zbus_list, bus_next)) 296 + next = NULL; 297 + 298 + if (next) 299 + __zpci_bus_get(next); 300 + 301 + *pos = next; 302 + mutex_unlock(&zbus_list_lock); 303 + 304 + /* zpci_bus_put() might drop refcount to 0 and locks zbus_list_lock */ 305 + if (curp) 306 + zpci_bus_put(curp); 274 307 } 275 308 276 309 static struct zpci_bus *zpci_bus_alloc(int topo, bool topo_is_tid) ··· 322 279 zbus->topo = topo; 323 280 zbus->topo_is_tid = topo_is_tid; 324 281 INIT_LIST_HEAD(&zbus->bus_next); 325 - mutex_lock(&zbus_list_lock); 326 - list_add_tail(&zbus->bus_next, &zbus_list); 327 - mutex_unlock(&zbus_list_lock); 328 282 329 283 kref_init(&zbus->kref); 330 284 INIT_LIST_HEAD(&zbus->resources); ··· 330 290 zbus->bus_resource.end = ZPCI_BUS_NR; 331 291 zbus->bus_resource.flags = IORESOURCE_BUS; 332 292 pci_add_resource(&zbus->resources, &zbus->bus_resource); 293 + 294 + mutex_lock(&zbus_list_lock); 295 + list_add_tail(&zbus->bus_next, &zbus_list); 296 + mutex_unlock(&zbus_list_lock); 333 297 334 298 return zbus; 335 299 }
+14 -1
arch/s390/pci/pci_bus.h
··· 15 15 void zpci_bus_device_unregister(struct zpci_dev *zdev); 16 16 17 17 int zpci_bus_scan_bus(struct zpci_bus *zbus); 18 - void zpci_bus_scan_busses(void); 18 + void zpci_bus_get_next(struct zpci_bus **pos); 19 + 20 + /** 21 + * zpci_bus_for_each - iterate over all the registered zbus objects 22 + * @pos: a struct zpci_bus * as cursor 23 + * 24 + * Acquires and releases references as the cursor iterates over the registered 25 + * objects. Is tolerant against concurrent removals of objects. 26 + * 27 + * Context: Process context. May sleep. 28 + */ 29 + #define zpci_bus_for_each(pos) \ 30 + for ((pos) = NULL, zpci_bus_get_next(&(pos)); (pos) != NULL; \ 31 + zpci_bus_get_next(&(pos))) 19 32 20 33 int zpci_bus_scan_device(struct zpci_dev *zdev); 21 34 void zpci_bus_remove_device(struct zpci_dev *zdev, bool set_error);