Merge tag 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma

Pull rdma fixes from Jason Gunthorpe:

- Fix several syzkaller-found bugs:
    - Poor parsing of the RDMA_NL_LS_OP_IP_RESOLVE netlink message
    - GID entry refcount leak when CM destruction races with multicast
      establishment
    - Missing refcount put in ib_del_sub_device_and_put()

- Fixup recently introduced uABI padding for 32-bit consistency

- Avoid user-triggered math overflow in MANA and EFA

- Reading invalid netdev data during a net event in irdma

- kdoc fixes

- Fix never-working GID copying in ib_get_gids_from_rdma_hdr()

- Typo in bnxt when validating the BAR

- bnxt mis-parsed IB_SEND_IP_CSUM so it didn't always work

- Out of bounds access in bnxt related to the counters on new
  devices

- Allocate the bnxt PDE table with the right sizing

- Use dma_free_coherent() correctly in bnxt

- Allow rxe to be unloadable when CONFIG_PROVE_LOCKING is enabled by
  adjusting the tracking of the global sockets it uses

- Missing unlocking on error path in rxe

- Compute the right number of pages in a MR in rtrs

* tag 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma:
RDMA/bnxt_re: fix dma_free_coherent() pointer
RDMA/rtrs: Fix clt_path::max_pages_per_mr calculation
IB/rxe: Fix missing umem_odp->umem_mutex unlock on error path
RDMA/bnxt_re: Fix to use correct page size for PDE table
RDMA/bnxt_re: Fix OOB write in bnxt_re_copy_err_stats()
RDMA/bnxt_re: Fix IB_SEND_IP_CSUM handling in post_send
RDMA/core: always drop device refcount in ib_del_sub_device_and_put()
RDMA/rxe: let rxe_reclassify_recv_socket() call sk_owner_put()
RDMA/bnxt_re: Fix incorrect BAR check in bnxt_qplib_map_creq_db()
RDMA/core: Fix logic error in ib_get_gids_from_rdma_hdr()
RDMA/efa: Remove possible negative shift
RTRS/rtrs: clean up rtrs headers kernel-doc
RDMA/irdma: avoid invalid read in irdma_net_event
RDMA/mana_ib: check cqe length for kernel CQs
RDMA/irdma: Fix irdma_alloc_ucontext_resp padding
RDMA/ucma: Fix rdma_ucm_query_ib_service_resp struct padding
RDMA/cm: Fix leaking the multicast GID table reference
RDMA/core: Check for the presence of LS_NLA_TYPE_DGID correctly

Changed files (+107 -68):
drivers/infiniband/core/addr.c (+10 -23)

···
                 .min = sizeof(struct rdma_nla_ls_gid)},
 };

-static inline bool ib_nl_is_good_ip_resp(const struct nlmsghdr *nlh)
+static void ib_nl_process_ip_rsep(const struct nlmsghdr *nlh)
 {
         struct nlattr *tb[LS_NLA_TYPE_MAX] = {};
+        union ib_gid gid;
+        struct addr_req *req;
+        int found = 0;
         int ret;

         if (nlh->nlmsg_flags & RDMA_NL_LS_F_ERR)
-                return false;
+                return;

         ret = nla_parse_deprecated(tb, LS_NLA_TYPE_MAX - 1, nlmsg_data(nlh),
                                    nlmsg_len(nlh), ib_nl_addr_policy, NULL);
         if (ret)
-                return false;
+                return;

-        return true;
-}
-
-static void ib_nl_process_good_ip_rsep(const struct nlmsghdr *nlh)
-{
-        const struct nlattr *head, *curr;
-        union ib_gid gid;
-        struct addr_req *req;
-        int len, rem;
-        int found = 0;
-
-        head = (const struct nlattr *)nlmsg_data(nlh);
-        len = nlmsg_len(nlh);
-
-        nla_for_each_attr(curr, head, len, rem) {
-                if (curr->nla_type == LS_NLA_TYPE_DGID)
-                        memcpy(&gid, nla_data(curr), nla_len(curr));
-        }
+        if (!tb[LS_NLA_TYPE_DGID])
+                return;
+        memcpy(&gid, nla_data(tb[LS_NLA_TYPE_DGID]), sizeof(gid));

         spin_lock_bh(&lock);
         list_for_each_entry(req, &req_list, list) {
···
             !(NETLINK_CB(skb).sk))
                 return -EPERM;

-        if (ib_nl_is_good_ip_resp(nlh))
-                ib_nl_process_good_ip_rsep(nlh);
+        ib_nl_process_ip_rsep(nlh);

         return 0;
 }
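The old open-coded loop copied the DGID without ever checking that the
attribute was present (leaving gid as uninitialized stack memory for a
malformed message) and trusted the sender-supplied nla_len() for the copy
size. A standalone userspace sketch of that bug class and the fixed
pattern — the attr table, ATTR_DGID, and parse_attrs() names are invented
for illustration, not kernel API:

#include <stdio.h>
#include <string.h>

#define ATTR_MAX  4
#define ATTR_DGID 2

struct attr { const void *data; size_t len; };

/* Parse: fill table slots only for attributes present in the message. */
static void parse_attrs(struct attr tb[ATTR_MAX], int have_dgid,
                        const unsigned char gid_bytes[16])
{
    memset(tb, 0, sizeof(struct attr) * ATTR_MAX);
    if (have_dgid) {
        tb[ATTR_DGID].data = gid_bytes;
        tb[ATTR_DGID].len = 16;
    }
}

int main(void)
{
    struct attr tb[ATTR_MAX];
    unsigned char gid[16];
    const unsigned char wire_gid[16] = { [15] = 1 };

    parse_attrs(tb, /*have_dgid=*/0, wire_gid);

    /*
     * Pre-fix pattern: unconditional copy; with the attribute absent,
     * 'gid' would keep whatever bytes were on the stack.
     * Post-fix pattern: check presence first, and copy a fixed
     * sizeof(gid) instead of a sender-controlled length.
     */
    if (!tb[ATTR_DGID].data) {
        puts("no DGID attribute: bail out instead of using garbage");
        return 0;
    }
    memcpy(gid, tb[ATTR_DGID].data, sizeof(gid));
    printf("gid[15] = %d\n", gid[15]);
    return 0;
}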
drivers/infiniband/core/cma.c (+3)

···
         ib_sa_free_multicast(mc->sa_mc);

         if (rdma_protocol_roce(id_priv->id.device, id_priv->id.port_num)) {
+                struct rdma_cm_event *event = &mc->iboe_join.event;
                 struct rdma_dev_addr *dev_addr =
                         &id_priv->id.route.addr.dev_addr;
                 struct net_device *ndev = NULL;
···
                         dev_put(ndev);

                 cancel_work_sync(&mc->iboe_join.work);
+                if (event->event == RDMA_CM_EVENT_MULTICAST_JOIN)
+                        rdma_destroy_ah_attr(&event->param.ud.ah_attr);
         }
         kfree(mc);
 }
drivers/infiniband/core/device.c (+3 -1)

···
 {
         struct ib_device *parent = sub->parent;

-        if (!parent)
+        if (!parent) {
+                ib_device_put(sub);
                 return -EOPNOTSUPP;
+        }

         mutex_lock(&parent->subdev_lock);
         list_del(&sub->subdev_list);
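By kernel convention, a function named *_and_put() consumes the caller's
reference on every return path, including error returns; the early
-EOPNOTSUPP return skipped that. A minimal userspace sketch of the
contract — struct obj and obj_put() are stand-ins, not the kernel's
ib_device refcounting:

#include <stdio.h>

struct obj { int refcount; struct obj *parent; };

static void obj_put(struct obj *o)
{
    if (--o->refcount == 0)
        puts("last reference dropped, object freed");
}

/* A *_and_put() helper must drop the caller's reference on every path.
 * Returning early without obj_put() (the pre-fix behavior) leaks one
 * reference per call. */
static int del_sub_device_and_put(struct obj *sub)
{
    if (!sub->parent) {
        obj_put(sub);   /* the fix: drop the ref on the error path too */
        return -1;      /* stands in for -EOPNOTSUPP */
    }
    /* ... unlink from parent ... */
    obj_put(sub);
    return 0;
}

int main(void)
{
    struct obj orphan = { .refcount = 1, .parent = NULL };

    del_sub_device_and_put(&orphan);   /* refcount reaches 0 now */
    printf("refcount = %d\n", orphan.refcount);
    return 0;
}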
drivers/infiniband/core/verbs.c (+1 -1)

···
                                        (struct in6_addr *)dgid);
                 return 0;
         } else if (net_type == RDMA_NETWORK_IPV6 ||
-                   net_type == RDMA_NETWORK_IB || RDMA_NETWORK_ROCE_V1) {
+                   net_type == RDMA_NETWORK_IB || net_type == RDMA_NETWORK_ROCE_V1) {
                 *dgid = hdr->ibgrh.dgid;
                 *sgid = hdr->ibgrh.sgid;
                 return 0;
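The root cause is the classic bare-constant bug: RDMA_NETWORK_ROCE_V1 on
its own is just a non-zero integer, so the || chain was always true no
matter what net_type held. A compilable illustration with invented enum
values:

#include <stdio.h>

enum net_type { NET_IPV4 = 1, NET_IPV6, NET_IB, NET_ROCE_V1 };

int main(void)
{
    enum net_type net_type = NET_IPV4;

    /* Pre-fix: 'NET_ROCE_V1' is a bare non-zero constant, so this
     * condition is always true regardless of net_type. */
    if (net_type == NET_IPV6 || net_type == NET_IB || NET_ROCE_V1)
        puts("pre-fix: branch taken even for NET_IPV4");

    /* Post-fix: every operand actually compares net_type. */
    if (net_type == NET_IPV6 || net_type == NET_IB ||
        net_type == NET_ROCE_V1)
        puts("post-fix: branch taken");
    else
        puts("post-fix: correctly not taken for NET_IPV4");
    return 0;
}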
drivers/infiniband/hw/bnxt_re/hw_counters.h (+3 -3)

···
         BNXT_RE_RES_SRQ_LOAD_ERR,
         BNXT_RE_RES_TX_PCI_ERR,
         BNXT_RE_RES_RX_PCI_ERR,
+        BNXT_RE_REQ_CQE_ERROR,
+        BNXT_RE_RESP_CQE_ERROR,
+        BNXT_RE_RESP_REMOTE_ACCESS_ERRS,
         BNXT_RE_OUT_OF_SEQ_ERR,
         BNXT_RE_TX_ATOMIC_REQ,
         BNXT_RE_TX_READ_REQ,
···
         BNXT_RE_TX_CNP,
         BNXT_RE_RX_CNP,
         BNXT_RE_RX_ECN,
-        BNXT_RE_REQ_CQE_ERROR,
-        BNXT_RE_RESP_CQE_ERROR,
-        BNXT_RE_RESP_REMOTE_ACCESS_ERRS,
         BNXT_RE_NUM_EXT_COUNTERS
 };
drivers/infiniband/hw/bnxt_re/ib_verbs.c (+1 -6)

···
                                 wqe.rawqp1.lflags |=
                                         SQ_SEND_RAWETH_QP1_LFLAGS_ROCE_CRC;
                         }
-                        switch (wr->send_flags) {
-                        case IB_SEND_IP_CSUM:
+                        if (wr->send_flags & IB_SEND_IP_CSUM)
                                 wqe.rawqp1.lflags |=
                                         SQ_SEND_RAWETH_QP1_LFLAGS_IP_CHKSUM;
-                                break;
-                        default:
-                                break;
-                        }
                         fallthrough;
                 case IB_WR_SEND_WITH_INV:
                         rc = bnxt_re_build_send_wqe(qp, wr, &wqe);
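send_flags is a bitmask, but switch compares the whole value for
equality, so IB_SEND_IP_CSUM only matched when it was the sole flag set;
combined with, say, IB_SEND_SIGNALED it fell into default and was
silently dropped. A small demo of the difference (flag values are
illustrative, loosely modeled on enum ib_send_flags):

#include <stdio.h>

#define SEND_FENCE    (1 << 0)
#define SEND_SIGNALED (1 << 1)
#define SEND_IP_CSUM  (1 << 4)

int main(void)
{
    unsigned int send_flags = SEND_SIGNALED | SEND_IP_CSUM;

    /* Pre-fix: switch tests the whole mask for equality, so IP_CSUM
     * is only honored when no other flag is set alongside it. */
    switch (send_flags) {
    case SEND_IP_CSUM:
        puts("switch: csum requested");
        break;
    default:
        puts("switch: csum silently dropped");
        break;
    }

    /* Post-fix: test the individual bit. */
    if (send_flags & SEND_IP_CSUM)
        puts("bitmask test: csum requested");
    return 0;
}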
drivers/infiniband/hw/bnxt_re/qplib_rcfw.c (+1 -1)

···
         creq_db->dbinfo.flags = 0;
         creq_db->reg.bar_id = RCFW_COMM_CONS_PCI_BAR_REGION;
         creq_db->reg.bar_base = pci_resource_start(pdev, creq_db->reg.bar_id);
-        if (!creq_db->reg.bar_id)
+        if (!creq_db->reg.bar_base)
                 dev_err(&pdev->dev,
                         "QPLIB: CREQ BAR region %d resc start is 0!",
                         creq_db->reg.bar_id);
drivers/infiniband/hw/bnxt_re/qplib_res.c (+3 -5)

···
         for (i = 0; i < pbl->pg_count; i++) {
                 if (pbl->pg_arr[i])
                         dma_free_coherent(&pdev->dev, pbl->pg_size,
-                                          (void *)((unsigned long)
-                                                   pbl->pg_arr[i] &
-                                                   PAGE_MASK),
+                                          pbl->pg_arr[i],
                                           pbl->pg_map_arr[i]);
                 else
                         dev_warn(&pdev->dev,
···
         if (npbl % BIT(MAX_PDL_LVL_SHIFT))
                 npde++;
         /* Alloc PDE pages */
-        sginfo.pgsize = npde * pg_size;
+        sginfo.pgsize = npde * ROCE_PG_SIZE_4K;
         sginfo.npages = 1;
         rc = __alloc_pbl(res, &hwq->pbl[PBL_LVL_0], &sginfo);
         if (rc)
···
         /* Alloc PBL pages */
         sginfo.npages = npbl;
-        sginfo.pgsize = PAGE_SIZE;
+        sginfo.pgsize = ROCE_PG_SIZE_4K;
         rc = __alloc_pbl(res, &hwq->pbl[PBL_LVL_1], &sginfo);
         if (rc)
                 goto fail;
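The first hunk matters because dma_free_coherent() must be handed the
exact CPU address returned by dma_alloc_coherent(); masking with
PAGE_MASK silently rewrites the pointer whenever the allocation does not
start on a page boundary. Plain malloc()/free() has the same contract,
so the trap is easy to show in userspace:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define PAGE_SIZE 4096UL
#define PAGE_MASK (~(PAGE_SIZE - 1))

int main(void)
{
    /* Stand-in for dma_alloc_coherent(): like it, free() must get
     * back the exact pointer the allocator returned. */
    void *buf = malloc(64);    /* typically not page-aligned */
    void *masked = (void *)((uintptr_t)buf & PAGE_MASK);

    printf("allocated %p, masked %p (%s)\n", buf, masked,
           buf == masked ? "same" : "DIFFERENT");

    /* Pre-fix pattern: free(masked) -- undefined behavior whenever
     * the allocation doesn't begin on a page boundary.
     * Post-fix: pass the original pointer. */
    free(buf);
    return 0;
}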
drivers/infiniband/hw/efa/efa_verbs.c (-4)

···
                             u32 hp_cnt,
                             u8 hp_shift)
 {
-        u32 pages_in_hp = BIT(hp_shift - PAGE_SHIFT);
         struct ib_block_iter biter;
         unsigned int hp_idx = 0;
-
-        ibdev_dbg(&dev->ibdev, "hp_cnt[%u], pages_in_hp[%u]\n",
-                  hp_cnt, pages_in_hp);

         rdma_umem_for_each_dma_block(umem, &biter, BIT(hp_shift))
                 page_list[hp_idx++] = rdma_block_iter_dma_address(&biter);
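BIT(hp_shift - PAGE_SHIFT) shifts by a negative count whenever the block
shift is smaller than PAGE_SHIFT, which is undefined behavior in C; since
the result only fed a debug print, the fix deletes the computation
entirely. A small sketch of the trap, with PAGE_SHIFT hardcoded to 12
for illustration:

#include <stdio.h>

#define PAGE_SHIFT 12
#define BIT(n) (1UL << (n))

int main(void)
{
    unsigned char hp_shift = 9;    /* a block smaller than PAGE_SIZE */

    /*
     * Pre-fix pattern: BIT(hp_shift - PAGE_SHIFT). With hp_shift <
     * PAGE_SHIFT the shift count is negative (here 9 - 12 = -3), which
     * is undefined behavior; UBSan reports it as a negative shift
     * exponent. Guard the count instead of shifting blindly.
     */
    int shift = hp_shift - PAGE_SHIFT;

    printf("shift count would be %d (UB if negative)\n", shift);
    if (shift >= 0)
        printf("pages_in_hp = %lu\n", BIT(shift));
    return 0;
}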
drivers/infiniband/hw/irdma/utils.c (+2 -1)

···
                           void *ptr)
 {
         struct neighbour *neigh = ptr;
-        struct net_device *real_dev, *netdev = (struct net_device *)neigh->dev;
+        struct net_device *real_dev, *netdev;
         struct irdma_device *iwdev;
         struct ib_device *ibdev;
         __be32 *p;
···

         switch (event) {
         case NETEVENT_NEIGH_UPDATE:
+                netdev = neigh->dev;
                 real_dev = rdma_vlan_dev_real_dev(netdev);
                 if (!real_dev)
                         real_dev = netdev;
drivers/infiniband/hw/mana/cq.c (+4)

···
                 doorbell = mana_ucontext->doorbell;
         } else {
                 is_rnic_cq = true;
+                if (attr->cqe > U32_MAX / COMP_ENTRY_SIZE / 2 + 1) {
+                        ibdev_dbg(ibdev, "CQE %d exceeding limit\n", attr->cqe);
+                        return -EINVAL;
+                }
                 buf_size = MANA_PAGE_ALIGN(roundup_pow_of_two(attr->cqe * COMP_ENTRY_SIZE));
                 cq->cqe = buf_size / COMP_ENTRY_SIZE;
                 err = mana_ib_create_kernel_queue(mdev, buf_size, GDMA_CQ, &cq->queue);
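The new check bounds attr->cqe so that
roundup_pow_of_two(attr->cqe * COMP_ENTRY_SIZE) stays representable in
32 bits. A worked example, assuming COMP_ENTRY_SIZE is 64 bytes (the
limit then works out to 2^25 entries):

#include <stdint.h>
#include <stdio.h>

#define COMP_ENTRY_SIZE 64u    /* assumed entry size, for illustration */

/* Round up to the next power of two, as roundup_pow_of_two() does,
 * but in 64 bits so we can see what 32-bit math would lose. */
static uint64_t roundup_pow_of_two64(uint64_t v)
{
    uint64_t p = 1;

    while (p < v)
        p <<= 1;
    return p;
}

int main(void)
{
    uint32_t limit = UINT32_MAX / COMP_ENTRY_SIZE / 2 + 1;  /* 2^25 here */
    uint32_t cqe = limit + 1;   /* first rejected value */

    uint64_t bytes = (uint64_t)cqe * COMP_ENTRY_SIZE;       /* 2^31 + 64 */
    uint64_t rounded = roundup_pow_of_two64(bytes);         /* 2^32 */

    printf("limit=%u cqe=%u bytes=%llu rounded=%llu\n",
           limit, cqe, (unsigned long long)bytes,
           (unsigned long long)rounded);
    printf("rounded fits in u32? %s\n",
           rounded <= UINT32_MAX ? "yes" : "no -> why the check exists");
    return 0;
}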
drivers/infiniband/sw/rxe/rxe_net.c (+32)

···
                 break;
         default:
                 WARN_ON_ONCE(1);
+                return;
         }
+        /*
+         * sock_lock_init_class_and_name() calls
+         * sk_owner_set(sk, THIS_MODULE); in order
+         * to make sure the referenced global
+         * variables rxe_recv_slock_key and
+         * rxe_recv_sk_key are not removed
+         * before the socket is closed.
+         *
+         * However this prevents rxe_net_exit()
+         * from being called and 'rmmod rdma_rxe'
+         * is refused because of the references.
+         *
+         * For the global sockets in recv_sockets,
+         * we are sure that rxe_net_exit() will call
+         * rxe_release_udp_tunnel -> udp_tunnel_sock_release.
+         *
+         * So we don't need the additional reference to
+         * our own (THIS_MODULE).
+         */
+        sk_owner_put(sk);
+        /*
+         * We also call sk_owner_clear() otherwise
+         * sk_owner_put(sk) in sk_prot_free will
+         * fail, which is called via
+         * sk_free -> __sk_free -> sk_destruct
+         * and sk_destruct calls __sk_destruct
+         * directly or via call_rcu()
+         * so sk_prot_free() might be called
+         * after rxe_net_exit().
+         */
+        sk_owner_clear(sk);
 #endif /* CONFIG_DEBUG_LOCK_ALLOC */
 }
drivers/infiniband/sw/rxe/rxe_odp.c (+3 -1)

···
                         return err;

                 need_fault = rxe_check_pagefault(umem_odp, iova, length);
-                if (need_fault)
+                if (need_fault) {
+                        mutex_unlock(&umem_odp->umem_mutex);
                         return -EFAULT;
+                }
         }

         return 0;
drivers/infiniband/ulp/rtrs/rtrs-clt.c (+1)

···
         mr_page_shift = max(12, ffs(ib_dev->attrs.page_size_cap) - 1);
         max_pages_per_mr = ib_dev->attrs.max_mr_size;
         do_div(max_pages_per_mr, (1ull << mr_page_shift));
+        max_pages_per_mr = min_not_zero(max_pages_per_mr, (u64)U32_MAX);
         clt_path->max_pages_per_mr =
                 min3(clt_path->max_pages_per_mr, (u32)max_pages_per_mr,
                      ib_dev->attrs.max_fast_reg_page_list_len);
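Without a clamp, a device advertising a max_mr_size of 2^44 or more
makes the 64-bit page count overflow the later (u32) cast in min3(),
truncating it to zero or to a uselessly small value. A worked example of
the truncation and the clamp (the clamp shown follows the same idea as
the added min_not_zero() line):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    /* e.g. a device reporting max_mr_size = 2^44, with 4K pages */
    uint64_t max_mr_size = 1ULL << 44;
    int mr_page_shift = 12;

    uint64_t max_pages_per_mr = max_mr_size >> mr_page_shift; /* 2^32 */

    /* Pre-fix: the (u32) cast in min3() truncates 2^32 to 0. */
    uint32_t truncated = (uint32_t)max_pages_per_mr;

    /* Post-fix idea: clamp the 64-bit value first, so the cast is safe. */
    uint64_t clamped = max_pages_per_mr > UINT32_MAX ?
                       UINT32_MAX : max_pages_per_mr;

    printf("pages=%llu truncated=%u clamped=%llu\n",
           (unsigned long long)max_pages_per_mr, truncated,
           (unsigned long long)clamped);
    return 0;
}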
drivers/infiniband/ulp/rtrs/rtrs-pri.h (+21 -11)

···
 /**
  * enum rtrs_msg_flags - RTRS message flags.
- * @RTRS_NEED_INVAL: Send invalidation in response.
+ * @RTRS_MSG_NEED_INVAL_F: Send invalidation in response.
  * @RTRS_MSG_NEW_RKEY_F: Send refreshed rkey in response.
  */
 enum rtrs_msg_flags {
···
  * @recon_cnt: Reconnections counter
  * @sess_uuid: UUID of a session (path)
  * @paths_uuid: UUID of a group of sessions (paths)
- *
+ * @first_conn: %1 if the connection request is the first for that session,
+ *              otherwise %0
  * NOTE: max size 56 bytes, see man rdma_connect().
  */
 struct rtrs_msg_conn_req {
-        /* Is set to 0 by cma.c in case of AF_IB, do not touch that.
-         * see https://www.spinics.net/lists/linux-rdma/msg22397.html
+        /**
+         * @__cma_version: Is set to 0 by cma.c in case of AF_IB, do not touch
+         * that. See https://www.spinics.net/lists/linux-rdma/msg22397.html
          */
         u8 __cma_version;
-        /* On sender side that should be set to 0, or cma_save_ip_info()
-         * extract garbage and will fail.
+        /**
+         * @__ip_version: On sender side that should be set to 0, or
+         * cma_save_ip_info() extract garbage and will fail.
          */
         u8 __ip_version;
         __le16 magic;
···
         uuid_t sess_uuid;
         uuid_t paths_uuid;
         u8 first_conn : 1;
+        /* private: */
         u8 reserved_bits : 7;
         u8 reserved[11];
 };
···
  * @queue_depth: max inflight messages (queue-depth) in this session
  * @max_io_size: max io size server supports
  * @max_hdr_size: max msg header size server supports
+ * @flags: RTRS message flags for this message
  *
  * NOTE: size is 56 bytes, max possible is 136 bytes, see man rdma_accept().
  */
···
         __le32 max_io_size;
         __le32 max_hdr_size;
         __le32 flags;
+        /* private: */
         u8 reserved[36];
 };

 /**
- * struct rtrs_msg_info_req
+ * struct rtrs_msg_info_req - client additional info request
  * @type: @RTRS_MSG_INFO_REQ
  * @pathname: Path name chosen by client
  */
 struct rtrs_msg_info_req {
         __le16 type;
         u8 pathname[NAME_MAX];
+        /* private: */
         u8 reserved[15];
 };

 /**
- * struct rtrs_msg_info_rsp
+ * struct rtrs_msg_info_rsp - server additional info response
  * @type: @RTRS_MSG_INFO_RSP
  * @sg_cnt: Number of @desc entries
  * @desc: RDMA buffers where the client can write to server
···
 struct rtrs_msg_info_rsp {
         __le16 type;
         __le16 sg_cnt;
+        /* private: */
         u8 reserved[4];
+        /* public: */
         struct rtrs_sg_desc desc[];
 };

 /**
- * struct rtrs_msg_rkey_rsp
+ * struct rtrs_msg_rkey_rsp - server refreshed rkey response
  * @type: @RTRS_MSG_RKEY_RSP
  * @buf_id: RDMA buf_id of the new rkey
  * @rkey: new remote key for RDMA buffers id from server
···
 /**
  * struct rtrs_msg_rdma_read - RDMA data transfer request from client
  * @type: always @RTRS_MSG_READ
+ * @flags: RTRS message flags (enum rtrs_msg_flags)
  * @usr_len: length of user payload
  * @sg_cnt: number of @desc entries
  * @desc: RDMA buffers where the server can write the result to
···
 };

 /**
- * struct_msg_rdma_write - Message transferred to server with RDMA-Write
+ * struct rtrs_msg_rdma_write - Message transferred to server with RDMA-Write
  * @type: always @RTRS_MSG_WRITE
  * @usr_len: length of user payload
  */
···
 };

 /**
- * struct_msg_rdma_hdr - header for read or write request
+ * struct rtrs_msg_rdma_hdr - header for read or write request
  * @type: @RTRS_MSG_WRITE | @RTRS_MSG_READ
  */
 struct rtrs_msg_rdma_hdr {
drivers/infiniband/ulp/rtrs/rtrs.h (+15 -9)

···
 /**
  * enum rtrs_clt_link_ev - Events about connectivity state of a client
- * @RTRS_CLT_LINK_EV_RECONNECTED	Client was reconnected.
- * @RTRS_CLT_LINK_EV_DISCONNECTED	Client was disconnected.
+ * @RTRS_CLT_LINK_EV_RECONNECTED: Client was reconnected.
+ * @RTRS_CLT_LINK_EV_DISCONNECTED: Client was disconnected.
  */
 enum rtrs_clt_link_ev {
         RTRS_CLT_LINK_EV_RECONNECTED,
···
 };

 /**
- * Source and destination address of a path to be established
+ * struct rtrs_addr - Source and destination address of a path to be established
+ * @src: source address
+ * @dst: destination address
  */
 struct rtrs_addr {
         struct sockaddr_storage *src;
···
 };

 /**
- * rtrs_clt_ops - it holds the link event callback and private pointer.
+ * struct rtrs_clt_ops - it holds the link event callback and private pointer.
  * @priv: User supplied private data.
  * @link_ev: Event notification callback function for connection state changes
  *	@priv: User supplied data that was passed to rtrs_clt_open()
···
 };

 /**
- * enum rtrs_clt_con_type() type of ib connection to use with a given
+ * enum rtrs_clt_con_type - type of ib connection to use with a given
  * rtrs_permit
- * @ADMIN_CON - use connection reserved for "service" messages
- * @IO_CON - use a connection reserved for IO
+ * @RTRS_ADMIN_CON: use connection reserved for "service" messages
+ * @RTRS_IO_CON: use a connection reserved for IO
  */
 enum rtrs_clt_con_type {
         RTRS_ADMIN_CON,
···
                           struct rtrs_permit *permit);

 /**
- * rtrs_clt_req_ops - it holds the request confirmation callback
+ * struct rtrs_clt_req_ops - it holds the request confirmation callback
  * and a private pointer.
  * @priv: User supplied private data.
  * @conf_fn: callback function to be called as confirmation
···
 int rtrs_clt_rdma_cq_direct(struct rtrs_clt_sess *clt, unsigned int index);

 /**
- * rtrs_attrs - RTRS session attributes
+ * struct rtrs_attrs - RTRS session attributes
+ * @queue_depth: queue_depth saved from rtrs_clt_sess message
+ * @max_io_size: max_io_size from rtrs_clt_sess message, capped to
+ *	@max_segments * %SZ_4K
+ * @max_segments: max_segments saved from rtrs_clt_sess message
  */
 struct rtrs_attrs {
         u32 queue_depth;
include/uapi/rdma/irdma-abi.h (+1 -1)

···
         __u8 rsvd2;
         __aligned_u64 comp_mask;
         __u16 min_hw_wq_size;
+        __u8 rsvd3[2];
         __u32 max_hw_srq_quanta;
-        __u8 rsvd3[2];
 };

 struct irdma_alloc_pd_resp {
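Placing the reserved bytes before max_hw_srq_quanta turns the compiler's
implicit 2-byte hole after the __u16 into explicit uABI layout, so every
architecture (including ones like m68k that align u32 on 2 bytes) sees
identical offsets and sizes. A sketch that makes the hidden padding
visible with offsetof()/sizeof() — the structs are trimmed stand-ins,
not the full irdma response:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Pre-fix layout: on most ABIs the compiler inserts a 2-byte hole after
 * min_hw_wq_size so the u32 lands on a 4-byte boundary, plus tail
 * padding after rsvd3; both holes are invisible in the source. */
struct resp_old {
    uint16_t min_hw_wq_size;
    uint32_t max_hw_srq_quanta;
    uint8_t  rsvd3[2];
};

/* Post-fix layout: the hole is spelled out, so the struct has the same
 * layout on every architecture and no hidden padding to leak. */
struct resp_new {
    uint16_t min_hw_wq_size;
    uint8_t  rsvd3[2];
    uint32_t max_hw_srq_quanta;
};

int main(void)
{
    printf("old: quanta at %zu, size %zu (implicit holes)\n",
           offsetof(struct resp_old, max_hw_srq_quanta),
           sizeof(struct resp_old));
    printf("new: quanta at %zu, size %zu (no hidden padding)\n",
           offsetof(struct resp_new, max_hw_srq_quanta),
           sizeof(struct resp_new));
    return 0;
}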
include/uapi/rdma/rdma_user_cm.h (+3 -1)

···

 struct rdma_ucm_query_ib_service_resp {
         __u32 num_service_recs;
+        __u32 reserved;
         struct ib_user_service_rec recs[];
 };
···

 #define RDMA_USER_CM_IB_SERVICE_NAME_SIZE 64
 struct rdma_ucm_ib_service {
-        __u64 service_id;
+        __aligned_u64 service_id;
         __u8 service_name[RDMA_USER_CM_IB_SERVICE_NAME_SIZE];
         __u32 flags;
         __u32 reserved;
···
 struct rdma_ucm_resolve_ib_service {
         __u32 id;
+        __u32 reserved;
         struct rdma_ucm_ib_service ibs;
 };