IPoIB: Fix deadlock on RTNL between bcast join comp and ipoib_stop()

Taking the RTNL lock in ipoib_mcast_join_complete() causes a deadlock
with ipoib_stop(). Avoid the deadlock by queuing the code that takes
the lock as a work item on ipoib_workqueue instead of running it
directly in the completion handler. This is safe because we only flush
ipoib_workqueue with the RTNL not held, so a flush never waits, while
holding the RTNL, for work that itself needs the RTNL.

The deadlock happens because ipoib_stop() calls ipoib_ib_dev_down(),
which calls ipoib_mcast_dev_flush(), which calls ipoib_mcast_free(),
which calls ipoib_mcast_leave(). The latter calls
ib_sa_free_multicast(), which waits until the multicast completion
handler finishes. That handler is ipoib_mcast_join_complete(), which
in turn waits for rtnl_lock() -- but the RTNL was already taken by
ipoib_stop(), so neither side can make progress.

This bug was introduced in commit a77a57a1 ("IPoIB: Fix deadlock on
RTNL in ipoib_stop()").

Signed-off-by: Yossi Etigin <yosefe@voltaire.com>
Signed-off-by: Roland Dreier <rolandd@cisco.com>

authored by Yossi Etigin and committed by Roland Dreier e8224e4b 1941246d

+24 -10
+2
drivers/infiniband/ulp/ipoib/ipoib.h
··· 293 293 294 294 struct delayed_work pkey_poll_task; 295 295 struct delayed_work mcast_task; 296 + struct work_struct carrier_on_task; 296 297 struct work_struct flush_light; 297 298 struct work_struct flush_normal; 298 299 struct work_struct flush_heavy; ··· 465 464 void ipoib_dev_cleanup(struct net_device *dev); 466 465 467 466 void ipoib_mcast_join_task(struct work_struct *work); 467 + void ipoib_mcast_carrier_on_task(struct work_struct *work); 468 468 void ipoib_mcast_send(struct net_device *dev, void *mgid, struct sk_buff *skb); 469 469 470 470 void ipoib_mcast_restart_task(struct work_struct *work);
+1
drivers/infiniband/ulp/ipoib/ipoib_main.c
··· 1075 1075 1076 1076 INIT_DELAYED_WORK(&priv->pkey_poll_task, ipoib_pkey_poll); 1077 1077 INIT_DELAYED_WORK(&priv->mcast_task, ipoib_mcast_join_task); 1078 + INIT_WORK(&priv->carrier_on_task, ipoib_mcast_carrier_on_task); 1078 1079 INIT_WORK(&priv->flush_light, ipoib_ib_dev_flush_light); 1079 1080 INIT_WORK(&priv->flush_normal, ipoib_ib_dev_flush_normal); 1080 1081 INIT_WORK(&priv->flush_heavy, ipoib_ib_dev_flush_heavy);
+21 -10
drivers/infiniband/ulp/ipoib/ipoib_multicast.c
··· 366 366 return ret; 367 367 } 368 368 369 + void ipoib_mcast_carrier_on_task(struct work_struct *work) 370 + { 371 + struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv, 372 + carrier_on_task); 373 + 374 + /* 375 + * Take rtnl_lock to avoid racing with ipoib_stop() and 376 + * turning the carrier back on while a device is being 377 + * removed. 378 + */ 379 + rtnl_lock(); 380 + netif_carrier_on(priv->dev); 381 + rtnl_unlock(); 382 + } 383 + 369 384 static int ipoib_mcast_join_complete(int status, 370 385 struct ib_sa_multicast *multicast) 371 386 { ··· 407 392 &priv->mcast_task, 0); 408 393 mutex_unlock(&mcast_mutex); 409 394 410 - if (mcast == priv->broadcast) { 411 - /* 412 - * Take RTNL lock here to avoid racing with 413 - * ipoib_stop() and turning the carrier back 414 - * on while a device is being removed. 415 - */ 416 - rtnl_lock(); 417 - netif_carrier_on(dev); 418 - rtnl_unlock(); 419 - } 395 + /* 396 + * Defer carrier on work to ipoib_workqueue to avoid a 397 + * deadlock on rtnl_lock here. 398 + */ 399 + if (mcast == priv->broadcast) 400 + queue_work(ipoib_workqueue, &priv->carrier_on_task); 420 401 421 402 return 0; 422 403 }