Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

Merge tag 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma

Pull rdma updates from Jason Gunthorpe:
"This cycle we got a new RDMA driver "ERDMA" for the Alibaba cloud
environment. Otherwise the changes are dominated by rxe fixes.

There is another RDMA driver on the list that might get merged next
cycle, 'MANA' for the Azure cloud environment.

Summary:

- Bug fixes and small features for irdma, hns, siw, qedr, hfi1, mlx5

- General spelling/grammar fixes

- rdma cm can follow changes in neighbours for control packets

- Significant amounts of rxe fixes and spec compliance changes

- Use the modern NAPI API

- Use the bitmap API instead of open coding (a short sketch of the
  pattern follows this summary)

- Performance improvements for rtrs

- Add the ERDMA driver for Alibaba cloud

- Fix a use after free bug in SRP"
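
The bitmap API item above comes down to replacing hand-rolled bit arrays (a
kcalloc() of BITS_TO_LONGS() words plus manual shift-and-mask arithmetic) with
the helpers from <linux/bitmap.h>. A minimal sketch of the resulting pattern;
the alloc_slot_map(), free_slot_map() and claim_first_free_slot() names are
hypothetical and not taken from any driver in this pull:

#include <linux/bitmap.h>
#include <linux/bitops.h>
#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/slab.h>

/* Track which of 'nr' slots are in use, without open-coded bit math. */
static unsigned long *alloc_slot_map(unsigned int nr)
{
	/*
	 * Replaces the open-coded
	 *   kcalloc(BITS_TO_LONGS(nr), sizeof(unsigned long), GFP_KERNEL)
	 * idiom; bitmap_zalloc() sizes and zeroes the map itself.
	 */
	return bitmap_zalloc(nr, GFP_KERNEL);
}

static void free_slot_map(unsigned long *map)
{
	bitmap_free(map);	/* pairs with bitmap_zalloc() */
}

static int claim_first_free_slot(unsigned long *map, unsigned int nr)
{
	unsigned int idx = find_first_zero_bit(map, nr);

	if (idx >= nr)
		return -ENOSPC;
	set_bit(idx, map);
	return idx;
}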

* tag 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma: (99 commits)
RDMA/ib_srpt: Unify checking rdma_cm_id condition in srpt_cm_req_recv()
RDMA/rxe: Fix error unwind in rxe_create_qp()
RDMA/mlx5: Add missing check for return value in get namespace flow
RDMA/rxe: Split qp state for requester and completer
RDMA/rxe: Generate error completion for error requester QP state
RDMA/rxe: Update wqe_index for each wqe error completion
RDMA/srpt: Fix a use-after-free
RDMA/srpt: Introduce a reference count in struct srpt_device
RDMA/srpt: Duplicate port name members
IB/qib: Fix repeated "in" within comments
RDMA/erdma: Add driver to kernel build environment
RDMA/erdma: Add the ABI definitions
RDMA/erdma: Add the erdma module
RDMA/erdma: Add connection management (CM) support
RDMA/erdma: Add verbs implementation
RDMA/erdma: Add verbs header file
RDMA/erdma: Add event queue implementation
RDMA/erdma: Add cmdq implementation
RDMA/erdma: Add main include file
RDMA/erdma: Add the hardware related definitions
...

+8369 -1008
+8
MAINTAINERS
···
 F:	Documentation/i2c/busses/i2c-ali1563.rst
 F:	drivers/i2c/busses/i2c-ali1563.c

+ALIBABA ELASTIC RDMA DRIVER
+M:	Cheng Xu <chengyou@linux.alibaba.com>
+M:	Kai Shen <kaishen@linux.alibaba.com>
+L:	linux-rdma@vger.kernel.org
+S:	Supported
+F:	drivers/infiniband/hw/erdma
+F:	include/uapi/rdma/erdma-abi.h
+
 ALIENWARE WMI DRIVER
 L:	Dell.Client.Kernel@dell.com
 S:	Maintained
+8 -7
drivers/infiniband/Kconfig
···
 	def_bool !HIGHMEM

 if INFINIBAND_USER_ACCESS || !INFINIBAND_USER_ACCESS
-source "drivers/infiniband/hw/mthca/Kconfig"
-source "drivers/infiniband/hw/qib/Kconfig"
+source "drivers/infiniband/hw/bnxt_re/Kconfig"
 source "drivers/infiniband/hw/cxgb4/Kconfig"
 source "drivers/infiniband/hw/efa/Kconfig"
+source "drivers/infiniband/hw/erdma/Kconfig"
+source "drivers/infiniband/hw/hfi1/Kconfig"
+source "drivers/infiniband/hw/hns/Kconfig"
 source "drivers/infiniband/hw/irdma/Kconfig"
 source "drivers/infiniband/hw/mlx4/Kconfig"
 source "drivers/infiniband/hw/mlx5/Kconfig"
+source "drivers/infiniband/hw/mthca/Kconfig"
 source "drivers/infiniband/hw/ocrdma/Kconfig"
-source "drivers/infiniband/hw/vmw_pvrdma/Kconfig"
-source "drivers/infiniband/hw/usnic/Kconfig"
-source "drivers/infiniband/hw/hns/Kconfig"
-source "drivers/infiniband/hw/bnxt_re/Kconfig"
-source "drivers/infiniband/hw/hfi1/Kconfig"
 source "drivers/infiniband/hw/qedr/Kconfig"
+source "drivers/infiniband/hw/qib/Kconfig"
+source "drivers/infiniband/hw/usnic/Kconfig"
+source "drivers/infiniband/hw/vmw_pvrdma/Kconfig"
 source "drivers/infiniband/sw/rdmavt/Kconfig"
 source "drivers/infiniband/sw/rxe/Kconfig"
 source "drivers/infiniband/sw/siw/Kconfig"
+218 -12
drivers/infiniband/core/cma.c
··· 11 11 #include <linux/in6.h> 12 12 #include <linux/mutex.h> 13 13 #include <linux/random.h> 14 + #include <linux/rbtree.h> 14 15 #include <linux/igmp.h> 15 16 #include <linux/xarray.h> 16 17 #include <linux/inetdevice.h> ··· 21 20 22 21 #include <net/net_namespace.h> 23 22 #include <net/netns/generic.h> 23 + #include <net/netevent.h> 24 24 #include <net/tcp.h> 25 25 #include <net/ipv6.h> 26 26 #include <net/ip_fib.h> ··· 170 168 static LIST_HEAD(dev_list); 171 169 static LIST_HEAD(listen_any_list); 172 170 static DEFINE_MUTEX(lock); 171 + static struct rb_root id_table = RB_ROOT; 172 + /* Serialize operations of id_table tree */ 173 + static DEFINE_SPINLOCK(id_table_lock); 173 174 static struct workqueue_struct *cma_wq; 174 175 static unsigned int cma_pernet_id; 175 176 ··· 206 201 return NULL; 207 202 } 208 203 } 204 + 205 + struct id_table_entry { 206 + struct list_head id_list; 207 + struct rb_node rb_node; 208 + }; 209 209 210 210 struct cma_device { 211 211 struct list_head list; ··· 430 420 return hdr->ip_version >> 4; 431 421 } 432 422 433 - static inline void cma_set_ip_ver(struct cma_hdr *hdr, u8 ip_ver) 423 + static void cma_set_ip_ver(struct cma_hdr *hdr, u8 ip_ver) 434 424 { 435 425 hdr->ip_version = (ip_ver << 4) | (hdr->ip_version & 0xF); 426 + } 427 + 428 + static struct sockaddr *cma_src_addr(struct rdma_id_private *id_priv) 429 + { 430 + return (struct sockaddr *)&id_priv->id.route.addr.src_addr; 431 + } 432 + 433 + static inline struct sockaddr *cma_dst_addr(struct rdma_id_private *id_priv) 434 + { 435 + return (struct sockaddr *)&id_priv->id.route.addr.dst_addr; 436 436 } 437 437 438 438 static int cma_igmp_send(struct net_device *ndev, union ib_gid *mgid, bool join) ··· 463 443 rtnl_unlock(); 464 444 } 465 445 return (in_dev) ? 0 : -ENODEV; 446 + } 447 + 448 + static int compare_netdev_and_ip(int ifindex_a, struct sockaddr *sa, 449 + struct id_table_entry *entry_b) 450 + { 451 + struct rdma_id_private *id_priv = list_first_entry( 452 + &entry_b->id_list, struct rdma_id_private, id_list_entry); 453 + int ifindex_b = id_priv->id.route.addr.dev_addr.bound_dev_if; 454 + struct sockaddr *sb = cma_dst_addr(id_priv); 455 + 456 + if (ifindex_a != ifindex_b) 457 + return (ifindex_a > ifindex_b) ? 
1 : -1; 458 + 459 + if (sa->sa_family != sb->sa_family) 460 + return sa->sa_family - sb->sa_family; 461 + 462 + if (sa->sa_family == AF_INET) 463 + return memcmp((char *)&((struct sockaddr_in *)sa)->sin_addr, 464 + (char *)&((struct sockaddr_in *)sb)->sin_addr, 465 + sizeof(((struct sockaddr_in *)sa)->sin_addr)); 466 + 467 + return ipv6_addr_cmp(&((struct sockaddr_in6 *)sa)->sin6_addr, 468 + &((struct sockaddr_in6 *)sb)->sin6_addr); 469 + } 470 + 471 + static int cma_add_id_to_tree(struct rdma_id_private *node_id_priv) 472 + { 473 + struct rb_node **new, *parent = NULL; 474 + struct id_table_entry *this, *node; 475 + unsigned long flags; 476 + int result; 477 + 478 + node = kzalloc(sizeof(*node), GFP_KERNEL); 479 + if (!node) 480 + return -ENOMEM; 481 + 482 + spin_lock_irqsave(&id_table_lock, flags); 483 + new = &id_table.rb_node; 484 + while (*new) { 485 + this = container_of(*new, struct id_table_entry, rb_node); 486 + result = compare_netdev_and_ip( 487 + node_id_priv->id.route.addr.dev_addr.bound_dev_if, 488 + cma_dst_addr(node_id_priv), this); 489 + 490 + parent = *new; 491 + if (result < 0) 492 + new = &((*new)->rb_left); 493 + else if (result > 0) 494 + new = &((*new)->rb_right); 495 + else { 496 + list_add_tail(&node_id_priv->id_list_entry, 497 + &this->id_list); 498 + kfree(node); 499 + goto unlock; 500 + } 501 + } 502 + 503 + INIT_LIST_HEAD(&node->id_list); 504 + list_add_tail(&node_id_priv->id_list_entry, &node->id_list); 505 + 506 + rb_link_node(&node->rb_node, parent, new); 507 + rb_insert_color(&node->rb_node, &id_table); 508 + 509 + unlock: 510 + spin_unlock_irqrestore(&id_table_lock, flags); 511 + return 0; 512 + } 513 + 514 + static struct id_table_entry * 515 + node_from_ndev_ip(struct rb_root *root, int ifindex, struct sockaddr *sa) 516 + { 517 + struct rb_node *node = root->rb_node; 518 + struct id_table_entry *data; 519 + int result; 520 + 521 + while (node) { 522 + data = container_of(node, struct id_table_entry, rb_node); 523 + result = compare_netdev_and_ip(ifindex, sa, data); 524 + if (result < 0) 525 + node = node->rb_left; 526 + else if (result > 0) 527 + node = node->rb_right; 528 + else 529 + return data; 530 + } 531 + 532 + return NULL; 533 + } 534 + 535 + static void cma_remove_id_from_tree(struct rdma_id_private *id_priv) 536 + { 537 + struct id_table_entry *data; 538 + unsigned long flags; 539 + 540 + spin_lock_irqsave(&id_table_lock, flags); 541 + if (list_empty(&id_priv->id_list_entry)) 542 + goto out; 543 + 544 + data = node_from_ndev_ip(&id_table, 545 + id_priv->id.route.addr.dev_addr.bound_dev_if, 546 + cma_dst_addr(id_priv)); 547 + if (!data) 548 + goto out; 549 + 550 + list_del_init(&id_priv->id_list_entry); 551 + if (list_empty(&data->id_list)) { 552 + rb_erase(&data->rb_node, &id_table); 553 + kfree(data); 554 + } 555 + out: 556 + spin_unlock_irqrestore(&id_table_lock, flags); 466 557 } 467 558 468 559 static void _cma_attach_to_dev(struct rdma_id_private *id_priv, ··· 610 479 id_priv->id.route.addr.dev_addr.sgid_attr = NULL; 611 480 } 612 481 mutex_unlock(&lock); 613 - } 614 - 615 - static inline struct sockaddr *cma_src_addr(struct rdma_id_private *id_priv) 616 - { 617 - return (struct sockaddr *) &id_priv->id.route.addr.src_addr; 618 - } 619 - 620 - static inline struct sockaddr *cma_dst_addr(struct rdma_id_private *id_priv) 621 - { 622 - return (struct sockaddr *) &id_priv->id.route.addr.dst_addr; 623 482 } 624 483 625 484 static inline unsigned short cma_family(struct rdma_id_private *id_priv) ··· 982 861 refcount_set(&id_priv->refcount, 1); 
983 862 mutex_init(&id_priv->handler_mutex); 984 863 INIT_LIST_HEAD(&id_priv->device_item); 864 + INIT_LIST_HEAD(&id_priv->id_list_entry); 985 865 INIT_LIST_HEAD(&id_priv->listen_list); 986 866 INIT_LIST_HEAD(&id_priv->mc_list); 987 867 get_random_bytes(&id_priv->seq_num, sizeof id_priv->seq_num); ··· 2005 1883 cma_cancel_operation(id_priv, state); 2006 1884 2007 1885 rdma_restrack_del(&id_priv->res); 1886 + cma_remove_id_from_tree(id_priv); 2008 1887 if (id_priv->cma_dev) { 2009 1888 if (rdma_cap_ib_cm(id_priv->id.device, 1)) { 2010 1889 if (id_priv->cm_id.ib) ··· 3295 3172 cma_id_get(id_priv); 3296 3173 if (rdma_cap_ib_sa(id->device, id->port_num)) 3297 3174 ret = cma_resolve_ib_route(id_priv, timeout_ms); 3298 - else if (rdma_protocol_roce(id->device, id->port_num)) 3175 + else if (rdma_protocol_roce(id->device, id->port_num)) { 3299 3176 ret = cma_resolve_iboe_route(id_priv); 3177 + if (!ret) 3178 + cma_add_id_to_tree(id_priv); 3179 + } 3300 3180 else if (rdma_protocol_iwarp(id->device, id->port_num)) 3301 3181 ret = cma_resolve_iw_route(id_priv); 3302 3182 else ··· 5048 4922 return ret; 5049 4923 } 5050 4924 4925 + static void cma_netevent_work_handler(struct work_struct *_work) 4926 + { 4927 + struct rdma_id_private *id_priv = 4928 + container_of(_work, struct rdma_id_private, id.net_work); 4929 + struct rdma_cm_event event = {}; 4930 + 4931 + mutex_lock(&id_priv->handler_mutex); 4932 + 4933 + if (READ_ONCE(id_priv->state) == RDMA_CM_DESTROYING || 4934 + READ_ONCE(id_priv->state) == RDMA_CM_DEVICE_REMOVAL) 4935 + goto out_unlock; 4936 + 4937 + event.event = RDMA_CM_EVENT_UNREACHABLE; 4938 + event.status = -ETIMEDOUT; 4939 + 4940 + if (cma_cm_event_handler(id_priv, &event)) { 4941 + __acquire(&id_priv->handler_mutex); 4942 + id_priv->cm_id.ib = NULL; 4943 + cma_id_put(id_priv); 4944 + destroy_id_handler_unlock(id_priv); 4945 + return; 4946 + } 4947 + 4948 + out_unlock: 4949 + mutex_unlock(&id_priv->handler_mutex); 4950 + cma_id_put(id_priv); 4951 + } 4952 + 4953 + static int cma_netevent_callback(struct notifier_block *self, 4954 + unsigned long event, void *ctx) 4955 + { 4956 + struct id_table_entry *ips_node = NULL; 4957 + struct rdma_id_private *current_id; 4958 + struct neighbour *neigh = ctx; 4959 + unsigned long flags; 4960 + 4961 + if (event != NETEVENT_NEIGH_UPDATE) 4962 + return NOTIFY_DONE; 4963 + 4964 + spin_lock_irqsave(&id_table_lock, flags); 4965 + if (neigh->tbl->family == AF_INET6) { 4966 + struct sockaddr_in6 neigh_sock_6; 4967 + 4968 + neigh_sock_6.sin6_family = AF_INET6; 4969 + neigh_sock_6.sin6_addr = *(struct in6_addr *)neigh->primary_key; 4970 + ips_node = node_from_ndev_ip(&id_table, neigh->dev->ifindex, 4971 + (struct sockaddr *)&neigh_sock_6); 4972 + } else if (neigh->tbl->family == AF_INET) { 4973 + struct sockaddr_in neigh_sock_4; 4974 + 4975 + neigh_sock_4.sin_family = AF_INET; 4976 + neigh_sock_4.sin_addr.s_addr = *(__be32 *)(neigh->primary_key); 4977 + ips_node = node_from_ndev_ip(&id_table, neigh->dev->ifindex, 4978 + (struct sockaddr *)&neigh_sock_4); 4979 + } else 4980 + goto out; 4981 + 4982 + if (!ips_node) 4983 + goto out; 4984 + 4985 + list_for_each_entry(current_id, &ips_node->id_list, id_list_entry) { 4986 + if (!memcmp(current_id->id.route.addr.dev_addr.dst_dev_addr, 4987 + neigh->ha, ETH_ALEN)) 4988 + continue; 4989 + INIT_WORK(&current_id->id.net_work, cma_netevent_work_handler); 4990 + cma_id_get(current_id); 4991 + queue_work(cma_wq, &current_id->id.net_work); 4992 + } 4993 + out: 4994 + spin_unlock_irqrestore(&id_table_lock, flags); 4995 + 
return NOTIFY_DONE; 4996 + } 4997 + 5051 4998 static struct notifier_block cma_nb = { 5052 4999 .notifier_call = cma_netdev_callback 5000 + }; 5001 + 5002 + static struct notifier_block cma_netevent_cb = { 5003 + .notifier_call = cma_netevent_callback 5053 5004 }; 5054 5005 5055 5006 static void cma_send_device_removal_put(struct rdma_id_private *id_priv) ··· 5351 5148 5352 5149 ib_sa_register_client(&sa_client); 5353 5150 register_netdevice_notifier(&cma_nb); 5151 + register_netevent_notifier(&cma_netevent_cb); 5354 5152 5355 5153 ret = ib_register_client(&cma_client); 5356 5154 if (ret) ··· 5366 5162 err_ib: 5367 5163 ib_unregister_client(&cma_client); 5368 5164 err: 5165 + unregister_netevent_notifier(&cma_netevent_cb); 5369 5166 unregister_netdevice_notifier(&cma_nb); 5370 5167 ib_sa_unregister_client(&sa_client); 5371 5168 unregister_pernet_subsys(&cma_pernet_operations); ··· 5379 5174 { 5380 5175 cma_configfs_exit(); 5381 5176 ib_unregister_client(&cma_client); 5177 + unregister_netevent_notifier(&cma_netevent_cb); 5382 5178 unregister_netdevice_notifier(&cma_nb); 5383 5179 ib_sa_unregister_client(&sa_client); 5384 5180 unregister_pernet_subsys(&cma_pernet_operations);
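
The cma.c change above keys RoCE rdma_cm IDs by (bound ifindex, destination IP)
in an rbtree and subscribes to neighbour-table updates, so control packets can
follow a next hop whose hardware address changed. Stripped down to the notifier
plumbing it relies on, the pattern looks like the sketch below; the mydrv_*
names are illustrative only, the real handler is cma_netevent_callback() in the
hunk above:

#include <linux/init.h>
#include <linux/module.h>
#include <linux/notifier.h>
#include <linux/printk.h>
#include <net/netevent.h>
#include <net/neighbour.h>

/* Runs on every neighbour-table event; only MAC updates are of interest. */
static int mydrv_netevent_cb(struct notifier_block *nb,
			     unsigned long event, void *ctx)
{
	struct neighbour *neigh = ctx;

	if (event != NETEVENT_NEIGH_UPDATE)
		return NOTIFY_DONE;

	/*
	 * neigh->dev->ifindex plus neigh->primary_key identify the
	 * (netdev, IP) pair and neigh->ha carries the new hardware
	 * address. A real consumer looks up its own state here and,
	 * like cma_netevent_callback(), defers the heavy lifting to a
	 * workqueue because this chain is called in atomic context.
	 */
	pr_debug("neighbour update on ifindex %d\n", neigh->dev->ifindex);
	return NOTIFY_DONE;
}

static struct notifier_block mydrv_netevent_nb = {
	.notifier_call = mydrv_netevent_cb,
};

static int __init mydrv_init(void)
{
	return register_netevent_notifier(&mydrv_netevent_nb);
}

static void __exit mydrv_exit(void)
{
	unregister_netevent_notifier(&mydrv_netevent_nb);
}

module_init(mydrv_init);
module_exit(mydrv_exit);
MODULE_LICENSE("GPL");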
+1
drivers/infiniband/core/cma_priv.h
···
 		struct list_head listen_item;
 		struct list_head listen_list;
 	};
+	struct list_head id_list_entry;
 	struct cma_device *cma_dev;
 	struct list_head mc_list;
+1 -1
drivers/infiniband/core/rdma_core.c
···
  * In exclusive access mode, we check that the counter is zero (nobody
  * claimed this object) and we set it to -1. Releasing a shared access
  * lock is done simply by decreasing the counter. As for exclusive
- * access locks, since only a single one of them is is allowed
+ * access locks, since only a single one of them is allowed
  * concurrently, setting the counter to zero is enough for releasing
  * this lock.
  */
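
The comment fixed above describes the uverbs object counter protocol: shared
users bump a counter, an exclusive user atomically moves it from 0 to -1, and
release either decrements the counter or writes 0 back. A generic sketch of
that claim/release protocol under those assumptions (not the literal
rdma_core.c implementation):

#include <linux/atomic.h>
#include <linux/errno.h>

/* usecnt: 0 = free, >0 = number of shared holders, -1 = exclusively claimed. */

static int claim_shared(atomic_t *usecnt)
{
	/* Increment unless the object is exclusively held (-1). */
	if (!atomic_add_unless(usecnt, 1, -1))
		return -EBUSY;
	return 0;
}

static int claim_exclusive(atomic_t *usecnt)
{
	/* Succeeds only when nobody holds the object (counter is 0). */
	if (atomic_cmpxchg(usecnt, 0, -1) != 0)
		return -EBUSY;
	return 0;
}

static void release_shared(atomic_t *usecnt)
{
	atomic_dec(usecnt);
}

static void release_exclusive(atomic_t *usecnt)
{
	/* Per the comment above, writing zero back is enough here. */
	atomic_set(usecnt, 0);
}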
+1 -1
drivers/infiniband/core/roce_gid_mgmt.c
···

 /**
  * is_upper_ndev_bond_master_filter - Check if a given netdevice
- * is bond master device of netdevice of the the RDMA device of port.
+ * is bond master device of netdevice of the RDMA device of port.
  * @ib_dev: IB device to check
  * @port: Port to consider for adding default GID
  * @rdma_ndev: Pointer to rdma netdevice
+1
drivers/infiniband/hw/Makefile
···
 obj-$(CONFIG_INFINIBAND_HNS) += hns/
 obj-$(CONFIG_INFINIBAND_QEDR) += qedr/
 obj-$(CONFIG_INFINIBAND_BNXT_RE) += bnxt_re/
+obj-$(CONFIG_INFINIBAND_ERDMA) += erdma/
+1 -1
drivers/infiniband/hw/bnxt_re/bnxt_re.h
···
 	/* Max of 2 lossless traffic class supported per port */
 	u16 cosq[2];

-	/* QP for for handling QP1 packets */
+	/* QP for handling QP1 packets */
 	struct bnxt_re_gsi_context gsi_ctx;
 	struct bnxt_re_stats stats;
 	atomic_t nq_alloc_cnt;
+12
drivers/infiniband/hw/erdma/Kconfig
···
+# SPDX-License-Identifier: GPL-2.0-only
+config INFINIBAND_ERDMA
+	tristate "Alibaba Elastic RDMA Adapter (ERDMA) support"
+	depends on PCI_MSI && 64BIT
+	depends on INFINIBAND_ADDR_TRANS
+	depends on INFINIBAND_USER_ACCESS
+	help
+	  This is a RDMA/iWarp driver for Alibaba Elastic RDMA Adapter(ERDMA),
+	  which supports RDMA features in Alibaba cloud environment.
+
+	  To compile this driver as module, choose M here. The module will be
+	  called erdma.
+4
drivers/infiniband/hw/erdma/Makefile
···
+# SPDX-License-Identifier: GPL-2.0
+obj-$(CONFIG_INFINIBAND_ERDMA) := erdma.o
+
+erdma-y := erdma_cm.o erdma_main.o erdma_cmdq.o erdma_cq.o erdma_verbs.o erdma_qp.o erdma_eq.o
+287
drivers/infiniband/hw/erdma/erdma.h
··· 1 + /* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */ 2 + 3 + /* Authors: Cheng Xu <chengyou@linux.alibaba.com> */ 4 + /* Kai Shen <kaishen@linux.alibaba.com> */ 5 + /* Copyright (c) 2020-2022, Alibaba Group. */ 6 + 7 + #ifndef __ERDMA_H__ 8 + #define __ERDMA_H__ 9 + 10 + #include <linux/bitfield.h> 11 + #include <linux/netdevice.h> 12 + #include <linux/xarray.h> 13 + #include <rdma/ib_verbs.h> 14 + 15 + #include "erdma_hw.h" 16 + 17 + #define DRV_MODULE_NAME "erdma" 18 + #define ERDMA_NODE_DESC "Elastic RDMA(iWARP) stack" 19 + 20 + struct erdma_eq { 21 + void *qbuf; 22 + dma_addr_t qbuf_dma_addr; 23 + 24 + spinlock_t lock; 25 + 26 + u32 depth; 27 + 28 + u16 ci; 29 + u16 rsvd; 30 + 31 + atomic64_t event_num; 32 + atomic64_t notify_num; 33 + 34 + u64 __iomem *db_addr; 35 + u64 *db_record; 36 + }; 37 + 38 + struct erdma_cmdq_sq { 39 + void *qbuf; 40 + dma_addr_t qbuf_dma_addr; 41 + 42 + spinlock_t lock; 43 + 44 + u32 depth; 45 + u16 ci; 46 + u16 pi; 47 + 48 + u16 wqebb_cnt; 49 + 50 + u64 *db_record; 51 + }; 52 + 53 + struct erdma_cmdq_cq { 54 + void *qbuf; 55 + dma_addr_t qbuf_dma_addr; 56 + 57 + spinlock_t lock; 58 + 59 + u32 depth; 60 + u32 ci; 61 + u32 cmdsn; 62 + 63 + u64 *db_record; 64 + 65 + atomic64_t armed_num; 66 + }; 67 + 68 + enum { 69 + ERDMA_CMD_STATUS_INIT, 70 + ERDMA_CMD_STATUS_ISSUED, 71 + ERDMA_CMD_STATUS_FINISHED, 72 + ERDMA_CMD_STATUS_TIMEOUT 73 + }; 74 + 75 + struct erdma_comp_wait { 76 + struct completion wait_event; 77 + u32 cmd_status; 78 + u32 ctx_id; 79 + u16 sq_pi; 80 + u8 comp_status; 81 + u8 rsvd; 82 + u32 comp_data[4]; 83 + }; 84 + 85 + enum { 86 + ERDMA_CMDQ_STATE_OK_BIT = 0, 87 + ERDMA_CMDQ_STATE_TIMEOUT_BIT = 1, 88 + ERDMA_CMDQ_STATE_CTX_ERR_BIT = 2, 89 + }; 90 + 91 + #define ERDMA_CMDQ_TIMEOUT_MS 15000 92 + #define ERDMA_REG_ACCESS_WAIT_MS 20 93 + #define ERDMA_WAIT_DEV_DONE_CNT 500 94 + 95 + struct erdma_cmdq { 96 + unsigned long *comp_wait_bitmap; 97 + struct erdma_comp_wait *wait_pool; 98 + spinlock_t lock; 99 + 100 + bool use_event; 101 + 102 + struct erdma_cmdq_sq sq; 103 + struct erdma_cmdq_cq cq; 104 + struct erdma_eq eq; 105 + 106 + unsigned long state; 107 + 108 + struct semaphore credits; 109 + u16 max_outstandings; 110 + }; 111 + 112 + #define COMPROMISE_CC ERDMA_CC_CUBIC 113 + enum erdma_cc_alg { 114 + ERDMA_CC_NEWRENO = 0, 115 + ERDMA_CC_CUBIC, 116 + ERDMA_CC_HPCC_RTT, 117 + ERDMA_CC_HPCC_ECN, 118 + ERDMA_CC_HPCC_INT, 119 + ERDMA_CC_METHODS_NUM 120 + }; 121 + 122 + struct erdma_devattr { 123 + u32 fw_version; 124 + 125 + unsigned char peer_addr[ETH_ALEN]; 126 + 127 + int numa_node; 128 + enum erdma_cc_alg cc; 129 + u32 grp_num; 130 + u32 irq_num; 131 + 132 + bool disable_dwqe; 133 + u16 dwqe_pages; 134 + u16 dwqe_entries; 135 + 136 + u32 max_qp; 137 + u32 max_send_wr; 138 + u32 max_recv_wr; 139 + u32 max_ord; 140 + u32 max_ird; 141 + 142 + u32 max_send_sge; 143 + u32 max_recv_sge; 144 + u32 max_sge_rd; 145 + u32 max_cq; 146 + u32 max_cqe; 147 + u64 max_mr_size; 148 + u32 max_mr; 149 + u32 max_pd; 150 + u32 max_mw; 151 + u32 local_dma_key; 152 + }; 153 + 154 + #define ERDMA_IRQNAME_SIZE 50 155 + 156 + struct erdma_irq { 157 + char name[ERDMA_IRQNAME_SIZE]; 158 + u32 msix_vector; 159 + cpumask_t affinity_hint_mask; 160 + }; 161 + 162 + struct erdma_eq_cb { 163 + bool ready; 164 + void *dev; /* All EQs use this fields to get erdma_dev struct */ 165 + struct erdma_irq irq; 166 + struct erdma_eq eq; 167 + struct tasklet_struct tasklet; 168 + }; 169 + 170 + struct erdma_resource_cb { 171 + unsigned long *bitmap; 172 + spinlock_t lock; 173 + 
u32 next_alloc_idx; 174 + u32 max_cap; 175 + }; 176 + 177 + enum { 178 + ERDMA_RES_TYPE_PD = 0, 179 + ERDMA_RES_TYPE_STAG_IDX = 1, 180 + ERDMA_RES_CNT = 2, 181 + }; 182 + 183 + #define ERDMA_EXTRA_BUFFER_SIZE ERDMA_DB_SIZE 184 + #define WARPPED_BUFSIZE(size) ((size) + ERDMA_EXTRA_BUFFER_SIZE) 185 + 186 + struct erdma_dev { 187 + struct ib_device ibdev; 188 + struct net_device *netdev; 189 + struct pci_dev *pdev; 190 + struct notifier_block netdev_nb; 191 + 192 + resource_size_t func_bar_addr; 193 + resource_size_t func_bar_len; 194 + u8 __iomem *func_bar; 195 + 196 + struct erdma_devattr attrs; 197 + /* physical port state (only one port per device) */ 198 + enum ib_port_state state; 199 + 200 + /* cmdq and aeq use the same msix vector */ 201 + struct erdma_irq comm_irq; 202 + struct erdma_cmdq cmdq; 203 + struct erdma_eq aeq; 204 + struct erdma_eq_cb ceqs[ERDMA_NUM_MSIX_VEC - 1]; 205 + 206 + spinlock_t lock; 207 + struct erdma_resource_cb res_cb[ERDMA_RES_CNT]; 208 + struct xarray qp_xa; 209 + struct xarray cq_xa; 210 + 211 + u32 next_alloc_qpn; 212 + u32 next_alloc_cqn; 213 + 214 + spinlock_t db_bitmap_lock; 215 + /* We provide max 64 uContexts that each has one SQ doorbell Page. */ 216 + DECLARE_BITMAP(sdb_page, ERDMA_DWQE_TYPE0_CNT); 217 + /* 218 + * We provide max 496 uContexts that each has one SQ normal Db, 219 + * and one directWQE db。 220 + */ 221 + DECLARE_BITMAP(sdb_entry, ERDMA_DWQE_TYPE1_CNT); 222 + 223 + atomic_t num_ctx; 224 + struct list_head cep_list; 225 + }; 226 + 227 + static inline void *get_queue_entry(void *qbuf, u32 idx, u32 depth, u32 shift) 228 + { 229 + idx &= (depth - 1); 230 + 231 + return qbuf + (idx << shift); 232 + } 233 + 234 + static inline struct erdma_dev *to_edev(struct ib_device *ibdev) 235 + { 236 + return container_of(ibdev, struct erdma_dev, ibdev); 237 + } 238 + 239 + static inline u32 erdma_reg_read32(struct erdma_dev *dev, u32 reg) 240 + { 241 + return readl(dev->func_bar + reg); 242 + } 243 + 244 + static inline u64 erdma_reg_read64(struct erdma_dev *dev, u32 reg) 245 + { 246 + return readq(dev->func_bar + reg); 247 + } 248 + 249 + static inline void erdma_reg_write32(struct erdma_dev *dev, u32 reg, u32 value) 250 + { 251 + writel(value, dev->func_bar + reg); 252 + } 253 + 254 + static inline void erdma_reg_write64(struct erdma_dev *dev, u32 reg, u64 value) 255 + { 256 + writeq(value, dev->func_bar + reg); 257 + } 258 + 259 + static inline u32 erdma_reg_read32_filed(struct erdma_dev *dev, u32 reg, 260 + u32 filed_mask) 261 + { 262 + u32 val = erdma_reg_read32(dev, reg); 263 + 264 + return FIELD_GET(filed_mask, val); 265 + } 266 + 267 + int erdma_cmdq_init(struct erdma_dev *dev); 268 + void erdma_finish_cmdq_init(struct erdma_dev *dev); 269 + void erdma_cmdq_destroy(struct erdma_dev *dev); 270 + 271 + void erdma_cmdq_build_reqhdr(u64 *hdr, u32 mod, u32 op); 272 + int erdma_post_cmd_wait(struct erdma_cmdq *cmdq, u64 *req, u32 req_size, 273 + u64 *resp0, u64 *resp1); 274 + void erdma_cmdq_completion_handler(struct erdma_cmdq *cmdq); 275 + 276 + int erdma_ceqs_init(struct erdma_dev *dev); 277 + void erdma_ceqs_uninit(struct erdma_dev *dev); 278 + void notify_eq(struct erdma_eq *eq); 279 + void *get_next_valid_eqe(struct erdma_eq *eq); 280 + 281 + int erdma_aeq_init(struct erdma_dev *dev); 282 + void erdma_aeq_destroy(struct erdma_dev *dev); 283 + 284 + void erdma_aeq_event_handler(struct erdma_dev *dev); 285 + void erdma_ceq_completion_handler(struct erdma_eq_cb *ceq_cb); 286 + 287 + #endif
+1430
drivers/infiniband/hw/erdma/erdma_cm.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause 2 + 3 + /* Authors: Cheng Xu <chengyou@linux.alibaba.com> */ 4 + /* Kai Shen <kaishen@linux.alibaba.com> */ 5 + /* Copyright (c) 2020-2022, Alibaba Group. */ 6 + 7 + /* Authors: Bernard Metzler <bmt@zurich.ibm.com> */ 8 + /* Fredy Neeser */ 9 + /* Greg Joyce <greg@opengridcomputing.com> */ 10 + /* Copyright (c) 2008-2019, IBM Corporation */ 11 + /* Copyright (c) 2017, Open Grid Computing, Inc. */ 12 + 13 + #include <linux/errno.h> 14 + #include <linux/inetdevice.h> 15 + #include <linux/net.h> 16 + #include <linux/types.h> 17 + #include <linux/workqueue.h> 18 + #include <net/addrconf.h> 19 + 20 + #include <rdma/ib_user_verbs.h> 21 + #include <rdma/ib_verbs.h> 22 + 23 + #include "erdma.h" 24 + #include "erdma_cm.h" 25 + #include "erdma_verbs.h" 26 + 27 + static struct workqueue_struct *erdma_cm_wq; 28 + 29 + static void erdma_cm_llp_state_change(struct sock *sk); 30 + static void erdma_cm_llp_data_ready(struct sock *sk); 31 + static void erdma_cm_llp_error_report(struct sock *sk); 32 + 33 + static void erdma_sk_assign_cm_upcalls(struct sock *sk) 34 + { 35 + write_lock_bh(&sk->sk_callback_lock); 36 + sk->sk_state_change = erdma_cm_llp_state_change; 37 + sk->sk_data_ready = erdma_cm_llp_data_ready; 38 + sk->sk_error_report = erdma_cm_llp_error_report; 39 + write_unlock_bh(&sk->sk_callback_lock); 40 + } 41 + 42 + static void erdma_sk_save_upcalls(struct sock *sk) 43 + { 44 + struct erdma_cep *cep = sk_to_cep(sk); 45 + 46 + write_lock_bh(&sk->sk_callback_lock); 47 + cep->sk_state_change = sk->sk_state_change; 48 + cep->sk_data_ready = sk->sk_data_ready; 49 + cep->sk_error_report = sk->sk_error_report; 50 + write_unlock_bh(&sk->sk_callback_lock); 51 + } 52 + 53 + static void erdma_sk_restore_upcalls(struct sock *sk, struct erdma_cep *cep) 54 + { 55 + sk->sk_state_change = cep->sk_state_change; 56 + sk->sk_data_ready = cep->sk_data_ready; 57 + sk->sk_error_report = cep->sk_error_report; 58 + sk->sk_user_data = NULL; 59 + } 60 + 61 + static void erdma_socket_disassoc(struct socket *s) 62 + { 63 + struct sock *sk = s->sk; 64 + struct erdma_cep *cep; 65 + 66 + if (sk) { 67 + write_lock_bh(&sk->sk_callback_lock); 68 + cep = sk_to_cep(sk); 69 + if (cep) { 70 + erdma_sk_restore_upcalls(sk, cep); 71 + erdma_cep_put(cep); 72 + } else { 73 + WARN_ON_ONCE(1); 74 + } 75 + write_unlock_bh(&sk->sk_callback_lock); 76 + } else { 77 + WARN_ON_ONCE(1); 78 + } 79 + } 80 + 81 + static void erdma_cep_socket_assoc(struct erdma_cep *cep, struct socket *s) 82 + { 83 + cep->sock = s; 84 + erdma_cep_get(cep); 85 + s->sk->sk_user_data = cep; 86 + 87 + erdma_sk_save_upcalls(s->sk); 88 + erdma_sk_assign_cm_upcalls(s->sk); 89 + } 90 + 91 + static void erdma_disassoc_listen_cep(struct erdma_cep *cep) 92 + { 93 + if (cep->listen_cep) { 94 + erdma_cep_put(cep->listen_cep); 95 + cep->listen_cep = NULL; 96 + } 97 + } 98 + 99 + static struct erdma_cep *erdma_cep_alloc(struct erdma_dev *dev) 100 + { 101 + struct erdma_cep *cep = kzalloc(sizeof(*cep), GFP_KERNEL); 102 + unsigned long flags; 103 + 104 + if (!cep) 105 + return NULL; 106 + 107 + INIT_LIST_HEAD(&cep->listenq); 108 + INIT_LIST_HEAD(&cep->devq); 109 + INIT_LIST_HEAD(&cep->work_freelist); 110 + 111 + kref_init(&cep->ref); 112 + cep->state = ERDMA_EPSTATE_IDLE; 113 + init_waitqueue_head(&cep->waitq); 114 + spin_lock_init(&cep->lock); 115 + cep->dev = dev; 116 + 117 + spin_lock_irqsave(&dev->lock, flags); 118 + list_add_tail(&cep->devq, &dev->cep_list); 119 + spin_unlock_irqrestore(&dev->lock, flags); 120 + 121 + 
return cep; 122 + } 123 + 124 + static void erdma_cm_free_work(struct erdma_cep *cep) 125 + { 126 + struct list_head *w, *tmp; 127 + struct erdma_cm_work *work; 128 + 129 + list_for_each_safe(w, tmp, &cep->work_freelist) { 130 + work = list_entry(w, struct erdma_cm_work, list); 131 + list_del(&work->list); 132 + kfree(work); 133 + } 134 + } 135 + 136 + static void erdma_cancel_mpatimer(struct erdma_cep *cep) 137 + { 138 + spin_lock_bh(&cep->lock); 139 + if (cep->mpa_timer) { 140 + if (cancel_delayed_work(&cep->mpa_timer->work)) { 141 + erdma_cep_put(cep); 142 + kfree(cep->mpa_timer); 143 + } 144 + cep->mpa_timer = NULL; 145 + } 146 + spin_unlock_bh(&cep->lock); 147 + } 148 + 149 + static void erdma_put_work(struct erdma_cm_work *work) 150 + { 151 + INIT_LIST_HEAD(&work->list); 152 + spin_lock_bh(&work->cep->lock); 153 + list_add(&work->list, &work->cep->work_freelist); 154 + spin_unlock_bh(&work->cep->lock); 155 + } 156 + 157 + static void erdma_cep_set_inuse(struct erdma_cep *cep) 158 + { 159 + unsigned long flags; 160 + 161 + spin_lock_irqsave(&cep->lock, flags); 162 + while (cep->in_use) { 163 + spin_unlock_irqrestore(&cep->lock, flags); 164 + wait_event_interruptible(cep->waitq, !cep->in_use); 165 + if (signal_pending(current)) 166 + flush_signals(current); 167 + 168 + spin_lock_irqsave(&cep->lock, flags); 169 + } 170 + 171 + cep->in_use = 1; 172 + spin_unlock_irqrestore(&cep->lock, flags); 173 + } 174 + 175 + static void erdma_cep_set_free(struct erdma_cep *cep) 176 + { 177 + unsigned long flags; 178 + 179 + spin_lock_irqsave(&cep->lock, flags); 180 + cep->in_use = 0; 181 + spin_unlock_irqrestore(&cep->lock, flags); 182 + 183 + wake_up(&cep->waitq); 184 + } 185 + 186 + static void __erdma_cep_dealloc(struct kref *ref) 187 + { 188 + struct erdma_cep *cep = container_of(ref, struct erdma_cep, ref); 189 + struct erdma_dev *dev = cep->dev; 190 + unsigned long flags; 191 + 192 + WARN_ON(cep->listen_cep); 193 + 194 + kfree(cep->private_data); 195 + kfree(cep->mpa.pdata); 196 + spin_lock_bh(&cep->lock); 197 + if (!list_empty(&cep->work_freelist)) 198 + erdma_cm_free_work(cep); 199 + spin_unlock_bh(&cep->lock); 200 + 201 + spin_lock_irqsave(&dev->lock, flags); 202 + list_del(&cep->devq); 203 + spin_unlock_irqrestore(&dev->lock, flags); 204 + kfree(cep); 205 + } 206 + 207 + static struct erdma_cm_work *erdma_get_work(struct erdma_cep *cep) 208 + { 209 + struct erdma_cm_work *work = NULL; 210 + 211 + spin_lock_bh(&cep->lock); 212 + if (!list_empty(&cep->work_freelist)) { 213 + work = list_entry(cep->work_freelist.next, struct erdma_cm_work, 214 + list); 215 + list_del_init(&work->list); 216 + } 217 + 218 + spin_unlock_bh(&cep->lock); 219 + return work; 220 + } 221 + 222 + static int erdma_cm_alloc_work(struct erdma_cep *cep, int num) 223 + { 224 + struct erdma_cm_work *work; 225 + 226 + while (num--) { 227 + work = kmalloc(sizeof(*work), GFP_KERNEL); 228 + if (!work) { 229 + if (!(list_empty(&cep->work_freelist))) 230 + erdma_cm_free_work(cep); 231 + return -ENOMEM; 232 + } 233 + work->cep = cep; 234 + INIT_LIST_HEAD(&work->list); 235 + list_add(&work->list, &cep->work_freelist); 236 + } 237 + 238 + return 0; 239 + } 240 + 241 + static int erdma_cm_upcall(struct erdma_cep *cep, enum iw_cm_event_type reason, 242 + int status) 243 + { 244 + struct iw_cm_event event; 245 + struct iw_cm_id *cm_id; 246 + 247 + memset(&event, 0, sizeof(event)); 248 + event.status = status; 249 + event.event = reason; 250 + 251 + if (reason == IW_CM_EVENT_CONNECT_REQUEST) { 252 + event.provider_data = cep; 253 + cm_id 
= cep->listen_cep->cm_id; 254 + 255 + event.ird = cep->dev->attrs.max_ird; 256 + event.ord = cep->dev->attrs.max_ord; 257 + } else { 258 + cm_id = cep->cm_id; 259 + } 260 + 261 + if (reason == IW_CM_EVENT_CONNECT_REQUEST || 262 + reason == IW_CM_EVENT_CONNECT_REPLY) { 263 + u16 pd_len = be16_to_cpu(cep->mpa.hdr.params.pd_len); 264 + 265 + if (pd_len && cep->mpa.pdata) { 266 + event.private_data_len = pd_len; 267 + event.private_data = cep->mpa.pdata; 268 + } 269 + 270 + getname_local(cep->sock, &event.local_addr); 271 + getname_peer(cep->sock, &event.remote_addr); 272 + } 273 + 274 + return cm_id->event_handler(cm_id, &event); 275 + } 276 + 277 + void erdma_qp_cm_drop(struct erdma_qp *qp) 278 + { 279 + struct erdma_cep *cep = qp->cep; 280 + 281 + if (!qp->cep) 282 + return; 283 + 284 + erdma_cep_set_inuse(cep); 285 + 286 + /* already closed. */ 287 + if (cep->state == ERDMA_EPSTATE_CLOSED) 288 + goto out; 289 + 290 + if (cep->cm_id) { 291 + switch (cep->state) { 292 + case ERDMA_EPSTATE_AWAIT_MPAREP: 293 + erdma_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, 294 + -EINVAL); 295 + break; 296 + case ERDMA_EPSTATE_RDMA_MODE: 297 + erdma_cm_upcall(cep, IW_CM_EVENT_CLOSE, 0); 298 + break; 299 + case ERDMA_EPSTATE_IDLE: 300 + case ERDMA_EPSTATE_LISTENING: 301 + case ERDMA_EPSTATE_CONNECTING: 302 + case ERDMA_EPSTATE_AWAIT_MPAREQ: 303 + case ERDMA_EPSTATE_RECVD_MPAREQ: 304 + case ERDMA_EPSTATE_CLOSED: 305 + default: 306 + break; 307 + } 308 + cep->cm_id->rem_ref(cep->cm_id); 309 + cep->cm_id = NULL; 310 + erdma_cep_put(cep); 311 + } 312 + cep->state = ERDMA_EPSTATE_CLOSED; 313 + 314 + if (cep->sock) { 315 + erdma_socket_disassoc(cep->sock); 316 + sock_release(cep->sock); 317 + cep->sock = NULL; 318 + } 319 + 320 + if (cep->qp) { 321 + cep->qp = NULL; 322 + erdma_qp_put(qp); 323 + } 324 + out: 325 + erdma_cep_set_free(cep); 326 + } 327 + 328 + void erdma_cep_put(struct erdma_cep *cep) 329 + { 330 + WARN_ON(kref_read(&cep->ref) < 1); 331 + kref_put(&cep->ref, __erdma_cep_dealloc); 332 + } 333 + 334 + void erdma_cep_get(struct erdma_cep *cep) 335 + { 336 + kref_get(&cep->ref); 337 + } 338 + 339 + static int erdma_send_mpareqrep(struct erdma_cep *cep, const void *pdata, 340 + u8 pd_len) 341 + { 342 + struct socket *s = cep->sock; 343 + struct mpa_rr *rr = &cep->mpa.hdr; 344 + struct kvec iov[3]; 345 + struct msghdr msg; 346 + int iovec_num = 0; 347 + int ret; 348 + int mpa_len; 349 + 350 + memset(&msg, 0, sizeof(msg)); 351 + 352 + rr->params.pd_len = cpu_to_be16(pd_len); 353 + 354 + iov[iovec_num].iov_base = rr; 355 + iov[iovec_num].iov_len = sizeof(*rr); 356 + iovec_num++; 357 + mpa_len = sizeof(*rr); 358 + 359 + iov[iovec_num].iov_base = &cep->mpa.ext_data; 360 + iov[iovec_num].iov_len = sizeof(cep->mpa.ext_data); 361 + iovec_num++; 362 + mpa_len += sizeof(cep->mpa.ext_data); 363 + 364 + if (pd_len) { 365 + iov[iovec_num].iov_base = (char *)pdata; 366 + iov[iovec_num].iov_len = pd_len; 367 + mpa_len += pd_len; 368 + iovec_num++; 369 + } 370 + 371 + ret = kernel_sendmsg(s, &msg, iov, iovec_num, mpa_len); 372 + 373 + return ret < 0 ? 
ret : 0; 374 + } 375 + 376 + static inline int ksock_recv(struct socket *sock, char *buf, size_t size, 377 + int flags) 378 + { 379 + struct kvec iov = { buf, size }; 380 + struct msghdr msg = { .msg_name = NULL, .msg_flags = flags }; 381 + 382 + return kernel_recvmsg(sock, &msg, &iov, 1, size, flags); 383 + } 384 + 385 + static int __recv_mpa_hdr(struct erdma_cep *cep, int hdr_rcvd, char *hdr, 386 + int hdr_size, int *rcvd_out) 387 + { 388 + struct socket *s = cep->sock; 389 + int rcvd; 390 + 391 + *rcvd_out = 0; 392 + if (hdr_rcvd < hdr_size) { 393 + rcvd = ksock_recv(s, hdr + hdr_rcvd, hdr_size - hdr_rcvd, 394 + MSG_DONTWAIT); 395 + if (rcvd == -EAGAIN) 396 + return -EAGAIN; 397 + 398 + if (rcvd <= 0) 399 + return -ECONNABORTED; 400 + 401 + hdr_rcvd += rcvd; 402 + *rcvd_out = rcvd; 403 + 404 + if (hdr_rcvd < hdr_size) 405 + return -EAGAIN; 406 + } 407 + 408 + return 0; 409 + } 410 + 411 + static void __mpa_rr_set_revision(__be16 *bits, u8 rev) 412 + { 413 + *bits = (*bits & ~MPA_RR_MASK_REVISION) | 414 + (cpu_to_be16(rev) & MPA_RR_MASK_REVISION); 415 + } 416 + 417 + static u8 __mpa_rr_revision(__be16 mpa_rr_bits) 418 + { 419 + __be16 rev = mpa_rr_bits & MPA_RR_MASK_REVISION; 420 + 421 + return (u8)be16_to_cpu(rev); 422 + } 423 + 424 + static void __mpa_ext_set_cc(__be32 *bits, u32 cc) 425 + { 426 + *bits = (*bits & ~MPA_EXT_FLAG_CC) | 427 + (cpu_to_be32(cc) & MPA_EXT_FLAG_CC); 428 + } 429 + 430 + static u8 __mpa_ext_cc(__be32 mpa_ext_bits) 431 + { 432 + __be32 cc = mpa_ext_bits & MPA_EXT_FLAG_CC; 433 + 434 + return (u8)be32_to_cpu(cc); 435 + } 436 + 437 + /* 438 + * Receive MPA Request/Reply header. 439 + * 440 + * Returns 0 if complete MPA Request/Reply haeder including 441 + * eventual private data was received. Returns -EAGAIN if 442 + * header was partially received or negative error code otherwise. 443 + * 444 + * Context: May be called in process context only 445 + */ 446 + static int erdma_recv_mpa_rr(struct erdma_cep *cep) 447 + { 448 + struct mpa_rr *hdr = &cep->mpa.hdr; 449 + struct socket *s = cep->sock; 450 + u16 pd_len; 451 + int rcvd, to_rcv, ret, pd_rcvd; 452 + 453 + if (cep->mpa.bytes_rcvd < sizeof(struct mpa_rr)) { 454 + ret = __recv_mpa_hdr(cep, cep->mpa.bytes_rcvd, 455 + (char *)&cep->mpa.hdr, 456 + sizeof(struct mpa_rr), &rcvd); 457 + cep->mpa.bytes_rcvd += rcvd; 458 + if (ret) 459 + return ret; 460 + } 461 + 462 + if (be16_to_cpu(hdr->params.pd_len) > MPA_MAX_PRIVDATA || 463 + __mpa_rr_revision(hdr->params.bits) != MPA_REVISION_EXT_1) 464 + return -EPROTO; 465 + 466 + if (cep->mpa.bytes_rcvd - sizeof(struct mpa_rr) < 467 + sizeof(struct erdma_mpa_ext)) { 468 + ret = __recv_mpa_hdr( 469 + cep, cep->mpa.bytes_rcvd - sizeof(struct mpa_rr), 470 + (char *)&cep->mpa.ext_data, 471 + sizeof(struct erdma_mpa_ext), &rcvd); 472 + cep->mpa.bytes_rcvd += rcvd; 473 + if (ret) 474 + return ret; 475 + } 476 + 477 + pd_len = be16_to_cpu(hdr->params.pd_len); 478 + pd_rcvd = cep->mpa.bytes_rcvd - sizeof(struct mpa_rr) - 479 + sizeof(struct erdma_mpa_ext); 480 + to_rcv = pd_len - pd_rcvd; 481 + 482 + if (!to_rcv) { 483 + /* 484 + * We have received the whole MPA Request/Reply message. 485 + * Check against peer protocol violation. 486 + */ 487 + u32 word; 488 + 489 + ret = __recv_mpa_hdr(cep, 0, (char *)&word, sizeof(word), 490 + &rcvd); 491 + if (ret == -EAGAIN && rcvd == 0) 492 + return 0; 493 + 494 + if (ret) 495 + return ret; 496 + 497 + return -EPROTO; 498 + } 499 + 500 + /* 501 + * At this point, MPA header has been fully received, and pd_len != 0. 
502 + * So, begin to receive private data. 503 + */ 504 + if (!cep->mpa.pdata) { 505 + cep->mpa.pdata = kmalloc(pd_len + 4, GFP_KERNEL); 506 + if (!cep->mpa.pdata) 507 + return -ENOMEM; 508 + } 509 + 510 + rcvd = ksock_recv(s, cep->mpa.pdata + pd_rcvd, to_rcv + 4, 511 + MSG_DONTWAIT); 512 + if (rcvd < 0) 513 + return rcvd; 514 + 515 + if (rcvd > to_rcv) 516 + return -EPROTO; 517 + 518 + cep->mpa.bytes_rcvd += rcvd; 519 + 520 + if (to_rcv == rcvd) 521 + return 0; 522 + 523 + return -EAGAIN; 524 + } 525 + 526 + /* 527 + * erdma_proc_mpareq() 528 + * 529 + * Read MPA Request from socket and signal new connection to IWCM 530 + * if success. Caller must hold lock on corresponding listening CEP. 531 + */ 532 + static int erdma_proc_mpareq(struct erdma_cep *cep) 533 + { 534 + struct mpa_rr *req; 535 + int ret; 536 + 537 + ret = erdma_recv_mpa_rr(cep); 538 + if (ret) 539 + return ret; 540 + 541 + req = &cep->mpa.hdr; 542 + 543 + if (memcmp(req->key, MPA_KEY_REQ, MPA_KEY_SIZE)) 544 + return -EPROTO; 545 + 546 + memcpy(req->key, MPA_KEY_REP, MPA_KEY_SIZE); 547 + 548 + /* Currently does not support marker and crc. */ 549 + if (req->params.bits & MPA_RR_FLAG_MARKERS || 550 + req->params.bits & MPA_RR_FLAG_CRC) 551 + goto reject_conn; 552 + 553 + cep->state = ERDMA_EPSTATE_RECVD_MPAREQ; 554 + 555 + /* Keep reference until IWCM accepts/rejects */ 556 + erdma_cep_get(cep); 557 + ret = erdma_cm_upcall(cep, IW_CM_EVENT_CONNECT_REQUEST, 0); 558 + if (ret) 559 + erdma_cep_put(cep); 560 + 561 + return ret; 562 + 563 + reject_conn: 564 + req->params.bits &= ~MPA_RR_FLAG_MARKERS; 565 + req->params.bits |= MPA_RR_FLAG_REJECT; 566 + req->params.bits &= ~MPA_RR_FLAG_CRC; 567 + 568 + kfree(cep->mpa.pdata); 569 + cep->mpa.pdata = NULL; 570 + erdma_send_mpareqrep(cep, NULL, 0); 571 + 572 + return -EOPNOTSUPP; 573 + } 574 + 575 + static int erdma_proc_mpareply(struct erdma_cep *cep) 576 + { 577 + struct erdma_qp_attrs qp_attrs; 578 + struct erdma_qp *qp = cep->qp; 579 + struct mpa_rr *rep; 580 + int ret; 581 + 582 + ret = erdma_recv_mpa_rr(cep); 583 + if (ret) 584 + goto out_err; 585 + 586 + erdma_cancel_mpatimer(cep); 587 + 588 + rep = &cep->mpa.hdr; 589 + 590 + if (memcmp(rep->key, MPA_KEY_REP, MPA_KEY_SIZE)) { 591 + ret = -EPROTO; 592 + goto out_err; 593 + } 594 + 595 + if (rep->params.bits & MPA_RR_FLAG_REJECT) { 596 + erdma_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, -ECONNRESET); 597 + return -ECONNRESET; 598 + } 599 + 600 + /* Currently does not support marker and crc. 
*/ 601 + if ((rep->params.bits & MPA_RR_FLAG_MARKERS) || 602 + (rep->params.bits & MPA_RR_FLAG_CRC)) { 603 + erdma_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, -ECONNREFUSED); 604 + return -EINVAL; 605 + } 606 + 607 + memset(&qp_attrs, 0, sizeof(qp_attrs)); 608 + qp_attrs.irq_size = cep->ird; 609 + qp_attrs.orq_size = cep->ord; 610 + qp_attrs.state = ERDMA_QP_STATE_RTS; 611 + 612 + down_write(&qp->state_lock); 613 + if (qp->attrs.state > ERDMA_QP_STATE_RTR) { 614 + ret = -EINVAL; 615 + up_write(&qp->state_lock); 616 + goto out_err; 617 + } 618 + 619 + qp->attrs.qp_type = ERDMA_QP_ACTIVE; 620 + if (__mpa_ext_cc(cep->mpa.ext_data.bits) != qp->attrs.cc) 621 + qp->attrs.cc = COMPROMISE_CC; 622 + 623 + ret = erdma_modify_qp_internal(qp, &qp_attrs, 624 + ERDMA_QP_ATTR_STATE | 625 + ERDMA_QP_ATTR_LLP_HANDLE | 626 + ERDMA_QP_ATTR_MPA); 627 + 628 + up_write(&qp->state_lock); 629 + 630 + if (!ret) { 631 + ret = erdma_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, 0); 632 + if (!ret) 633 + cep->state = ERDMA_EPSTATE_RDMA_MODE; 634 + 635 + return 0; 636 + } 637 + 638 + out_err: 639 + if (ret != -EAGAIN) 640 + erdma_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, -EINVAL); 641 + 642 + return ret; 643 + } 644 + 645 + static void erdma_accept_newconn(struct erdma_cep *cep) 646 + { 647 + struct socket *s = cep->sock; 648 + struct socket *new_s = NULL; 649 + struct erdma_cep *new_cep = NULL; 650 + int ret = 0; 651 + 652 + if (cep->state != ERDMA_EPSTATE_LISTENING) 653 + goto error; 654 + 655 + new_cep = erdma_cep_alloc(cep->dev); 656 + if (!new_cep) 657 + goto error; 658 + 659 + /* 660 + * 4: Allocate a sufficient number of work elements 661 + * to allow concurrent handling of local + peer close 662 + * events, MPA header processing + MPA timeout. 663 + */ 664 + if (erdma_cm_alloc_work(new_cep, 4) != 0) 665 + goto error; 666 + 667 + /* 668 + * Copy saved socket callbacks from listening CEP 669 + * and assign new socket with new CEP 670 + */ 671 + new_cep->sk_state_change = cep->sk_state_change; 672 + new_cep->sk_data_ready = cep->sk_data_ready; 673 + new_cep->sk_error_report = cep->sk_error_report; 674 + 675 + ret = kernel_accept(s, &new_s, O_NONBLOCK); 676 + if (ret != 0) 677 + goto error; 678 + 679 + new_cep->sock = new_s; 680 + erdma_cep_get(new_cep); 681 + new_s->sk->sk_user_data = new_cep; 682 + 683 + tcp_sock_set_nodelay(new_s->sk); 684 + new_cep->state = ERDMA_EPSTATE_AWAIT_MPAREQ; 685 + 686 + ret = erdma_cm_queue_work(new_cep, ERDMA_CM_WORK_MPATIMEOUT); 687 + if (ret) 688 + goto error; 689 + 690 + new_cep->listen_cep = cep; 691 + erdma_cep_get(cep); 692 + 693 + if (atomic_read(&new_s->sk->sk_rmem_alloc)) { 694 + /* MPA REQ already queued */ 695 + erdma_cep_set_inuse(new_cep); 696 + ret = erdma_proc_mpareq(new_cep); 697 + if (ret != -EAGAIN) { 698 + erdma_cep_put(cep); 699 + new_cep->listen_cep = NULL; 700 + if (ret) { 701 + erdma_cep_set_free(new_cep); 702 + goto error; 703 + } 704 + } 705 + erdma_cep_set_free(new_cep); 706 + } 707 + return; 708 + 709 + error: 710 + if (new_cep) { 711 + new_cep->state = ERDMA_EPSTATE_CLOSED; 712 + erdma_cancel_mpatimer(new_cep); 713 + 714 + erdma_cep_put(new_cep); 715 + new_cep->sock = NULL; 716 + } 717 + 718 + if (new_s) { 719 + erdma_socket_disassoc(new_s); 720 + sock_release(new_s); 721 + } 722 + } 723 + 724 + static int erdma_newconn_connected(struct erdma_cep *cep) 725 + { 726 + int ret = 0; 727 + 728 + cep->mpa.hdr.params.bits = 0; 729 + __mpa_rr_set_revision(&cep->mpa.hdr.params.bits, MPA_REVISION_EXT_1); 730 + 731 + memcpy(cep->mpa.hdr.key, MPA_KEY_REQ, MPA_KEY_SIZE); 
732 + cep->mpa.ext_data.cookie = cpu_to_be32(cep->qp->attrs.cookie); 733 + __mpa_ext_set_cc(&cep->mpa.ext_data.bits, cep->qp->attrs.cc); 734 + 735 + ret = erdma_send_mpareqrep(cep, cep->private_data, cep->pd_len); 736 + cep->state = ERDMA_EPSTATE_AWAIT_MPAREP; 737 + cep->mpa.hdr.params.pd_len = 0; 738 + 739 + if (ret >= 0) 740 + ret = erdma_cm_queue_work(cep, ERDMA_CM_WORK_MPATIMEOUT); 741 + 742 + return ret; 743 + } 744 + 745 + static void erdma_cm_work_handler(struct work_struct *w) 746 + { 747 + struct erdma_cm_work *work; 748 + struct erdma_cep *cep; 749 + int release_cep = 0, ret = 0; 750 + 751 + work = container_of(w, struct erdma_cm_work, work.work); 752 + cep = work->cep; 753 + 754 + erdma_cep_set_inuse(cep); 755 + 756 + switch (work->type) { 757 + case ERDMA_CM_WORK_CONNECTED: 758 + erdma_cancel_mpatimer(cep); 759 + if (cep->state == ERDMA_EPSTATE_CONNECTING) { 760 + ret = erdma_newconn_connected(cep); 761 + if (ret) { 762 + erdma_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, 763 + -EIO); 764 + release_cep = 1; 765 + } 766 + } 767 + break; 768 + case ERDMA_CM_WORK_CONNECTTIMEOUT: 769 + if (cep->state == ERDMA_EPSTATE_CONNECTING) { 770 + cep->mpa_timer = NULL; 771 + erdma_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, 772 + -ETIMEDOUT); 773 + release_cep = 1; 774 + } 775 + break; 776 + case ERDMA_CM_WORK_ACCEPT: 777 + erdma_accept_newconn(cep); 778 + break; 779 + case ERDMA_CM_WORK_READ_MPAHDR: 780 + if (cep->state == ERDMA_EPSTATE_AWAIT_MPAREQ) { 781 + if (cep->listen_cep) { 782 + erdma_cep_set_inuse(cep->listen_cep); 783 + 784 + if (cep->listen_cep->state == 785 + ERDMA_EPSTATE_LISTENING) 786 + ret = erdma_proc_mpareq(cep); 787 + else 788 + ret = -EFAULT; 789 + 790 + erdma_cep_set_free(cep->listen_cep); 791 + 792 + if (ret != -EAGAIN) { 793 + erdma_cep_put(cep->listen_cep); 794 + cep->listen_cep = NULL; 795 + if (ret) 796 + erdma_cep_put(cep); 797 + } 798 + } 799 + } else if (cep->state == ERDMA_EPSTATE_AWAIT_MPAREP) { 800 + ret = erdma_proc_mpareply(cep); 801 + } 802 + 803 + if (ret && ret != -EAGAIN) 804 + release_cep = 1; 805 + break; 806 + case ERDMA_CM_WORK_CLOSE_LLP: 807 + if (cep->cm_id) 808 + erdma_cm_upcall(cep, IW_CM_EVENT_CLOSE, 0); 809 + release_cep = 1; 810 + break; 811 + case ERDMA_CM_WORK_PEER_CLOSE: 812 + if (cep->cm_id) { 813 + if (cep->state == ERDMA_EPSTATE_CONNECTING || 814 + cep->state == ERDMA_EPSTATE_AWAIT_MPAREP) { 815 + /* 816 + * MPA reply not received, but connection drop 817 + */ 818 + erdma_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, 819 + -ECONNRESET); 820 + } else if (cep->state == ERDMA_EPSTATE_RDMA_MODE) { 821 + /* 822 + * NOTE: IW_CM_EVENT_DISCONNECT is given just 823 + * to transition IWCM into CLOSING. 824 + */ 825 + erdma_cm_upcall(cep, IW_CM_EVENT_DISCONNECT, 0); 826 + erdma_cm_upcall(cep, IW_CM_EVENT_CLOSE, 0); 827 + } 828 + } else if (cep->state == ERDMA_EPSTATE_AWAIT_MPAREQ) { 829 + /* Socket close before MPA request received. 
*/ 830 + erdma_disassoc_listen_cep(cep); 831 + erdma_cep_put(cep); 832 + } 833 + release_cep = 1; 834 + break; 835 + case ERDMA_CM_WORK_MPATIMEOUT: 836 + cep->mpa_timer = NULL; 837 + if (cep->state == ERDMA_EPSTATE_AWAIT_MPAREP) { 838 + /* 839 + * MPA request timed out: 840 + * Hide any partially received private data and signal 841 + * timeout 842 + */ 843 + cep->mpa.hdr.params.pd_len = 0; 844 + 845 + if (cep->cm_id) 846 + erdma_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, 847 + -ETIMEDOUT); 848 + release_cep = 1; 849 + } else if (cep->state == ERDMA_EPSTATE_AWAIT_MPAREQ) { 850 + /* No MPA req received after peer TCP stream setup. */ 851 + erdma_disassoc_listen_cep(cep); 852 + 853 + erdma_cep_put(cep); 854 + release_cep = 1; 855 + } 856 + break; 857 + default: 858 + WARN(1, "Undefined CM work type: %d\n", work->type); 859 + } 860 + 861 + if (release_cep) { 862 + erdma_cancel_mpatimer(cep); 863 + cep->state = ERDMA_EPSTATE_CLOSED; 864 + if (cep->qp) { 865 + struct erdma_qp *qp = cep->qp; 866 + /* 867 + * Serialize a potential race with application 868 + * closing the QP and calling erdma_qp_cm_drop() 869 + */ 870 + erdma_qp_get(qp); 871 + erdma_cep_set_free(cep); 872 + 873 + erdma_qp_llp_close(qp); 874 + erdma_qp_put(qp); 875 + 876 + erdma_cep_set_inuse(cep); 877 + cep->qp = NULL; 878 + erdma_qp_put(qp); 879 + } 880 + 881 + if (cep->sock) { 882 + erdma_socket_disassoc(cep->sock); 883 + sock_release(cep->sock); 884 + cep->sock = NULL; 885 + } 886 + 887 + if (cep->cm_id) { 888 + cep->cm_id->rem_ref(cep->cm_id); 889 + cep->cm_id = NULL; 890 + if (cep->state != ERDMA_EPSTATE_LISTENING) 891 + erdma_cep_put(cep); 892 + } 893 + } 894 + erdma_cep_set_free(cep); 895 + erdma_put_work(work); 896 + erdma_cep_put(cep); 897 + } 898 + 899 + int erdma_cm_queue_work(struct erdma_cep *cep, enum erdma_work_type type) 900 + { 901 + struct erdma_cm_work *work = erdma_get_work(cep); 902 + unsigned long delay = 0; 903 + 904 + if (!work) 905 + return -ENOMEM; 906 + 907 + work->type = type; 908 + work->cep = cep; 909 + 910 + erdma_cep_get(cep); 911 + 912 + INIT_DELAYED_WORK(&work->work, erdma_cm_work_handler); 913 + 914 + if (type == ERDMA_CM_WORK_MPATIMEOUT) { 915 + cep->mpa_timer = work; 916 + 917 + if (cep->state == ERDMA_EPSTATE_AWAIT_MPAREP) 918 + delay = MPAREP_TIMEOUT; 919 + else 920 + delay = MPAREQ_TIMEOUT; 921 + } else if (type == ERDMA_CM_WORK_CONNECTTIMEOUT) { 922 + cep->mpa_timer = work; 923 + 924 + delay = CONNECT_TIMEOUT; 925 + } 926 + 927 + queue_delayed_work(erdma_cm_wq, &work->work, delay); 928 + 929 + return 0; 930 + } 931 + 932 + static void erdma_cm_llp_data_ready(struct sock *sk) 933 + { 934 + struct erdma_cep *cep; 935 + 936 + read_lock(&sk->sk_callback_lock); 937 + 938 + cep = sk_to_cep(sk); 939 + if (!cep) 940 + goto out; 941 + 942 + if (cep->state == ERDMA_EPSTATE_AWAIT_MPAREQ || 943 + cep->state == ERDMA_EPSTATE_AWAIT_MPAREP) 944 + erdma_cm_queue_work(cep, ERDMA_CM_WORK_READ_MPAHDR); 945 + 946 + out: 947 + read_unlock(&sk->sk_callback_lock); 948 + } 949 + 950 + static void erdma_cm_llp_error_report(struct sock *sk) 951 + { 952 + struct erdma_cep *cep = sk_to_cep(sk); 953 + 954 + if (cep) 955 + cep->sk_error_report(sk); 956 + } 957 + 958 + static void erdma_cm_llp_state_change(struct sock *sk) 959 + { 960 + struct erdma_cep *cep; 961 + void (*orig_state_change)(struct sock *sk); 962 + 963 + read_lock(&sk->sk_callback_lock); 964 + 965 + cep = sk_to_cep(sk); 966 + if (!cep) { 967 + read_unlock(&sk->sk_callback_lock); 968 + return; 969 + } 970 + orig_state_change = cep->sk_state_change; 971 + 
972 + switch (sk->sk_state) { 973 + case TCP_ESTABLISHED: 974 + if (cep->state == ERDMA_EPSTATE_CONNECTING) 975 + erdma_cm_queue_work(cep, ERDMA_CM_WORK_CONNECTED); 976 + else 977 + erdma_cm_queue_work(cep, ERDMA_CM_WORK_ACCEPT); 978 + break; 979 + case TCP_CLOSE: 980 + case TCP_CLOSE_WAIT: 981 + if (cep->state != ERDMA_EPSTATE_LISTENING) 982 + erdma_cm_queue_work(cep, ERDMA_CM_WORK_PEER_CLOSE); 983 + break; 984 + default: 985 + break; 986 + } 987 + read_unlock(&sk->sk_callback_lock); 988 + orig_state_change(sk); 989 + } 990 + 991 + static int kernel_bindconnect(struct socket *s, struct sockaddr *laddr, 992 + int laddrlen, struct sockaddr *raddr, 993 + int raddrlen, int flags) 994 + { 995 + int ret; 996 + 997 + sock_set_reuseaddr(s->sk); 998 + ret = s->ops->bind(s, laddr, laddrlen); 999 + if (ret) 1000 + return ret; 1001 + ret = s->ops->connect(s, raddr, raddrlen, flags); 1002 + return ret < 0 ? ret : 0; 1003 + } 1004 + 1005 + int erdma_connect(struct iw_cm_id *id, struct iw_cm_conn_param *params) 1006 + { 1007 + struct erdma_dev *dev = to_edev(id->device); 1008 + struct erdma_qp *qp; 1009 + struct erdma_cep *cep = NULL; 1010 + struct socket *s = NULL; 1011 + struct sockaddr *laddr = (struct sockaddr *)&id->m_local_addr; 1012 + struct sockaddr *raddr = (struct sockaddr *)&id->m_remote_addr; 1013 + u16 pd_len = params->private_data_len; 1014 + int ret; 1015 + 1016 + if (pd_len > MPA_MAX_PRIVDATA) 1017 + return -EINVAL; 1018 + 1019 + if (params->ird > dev->attrs.max_ird || 1020 + params->ord > dev->attrs.max_ord) 1021 + return -EINVAL; 1022 + 1023 + if (laddr->sa_family != AF_INET || raddr->sa_family != AF_INET) 1024 + return -EAFNOSUPPORT; 1025 + 1026 + qp = find_qp_by_qpn(dev, params->qpn); 1027 + if (!qp) 1028 + return -ENOENT; 1029 + erdma_qp_get(qp); 1030 + 1031 + ret = sock_create(AF_INET, SOCK_STREAM, IPPROTO_TCP, &s); 1032 + if (ret < 0) 1033 + goto error_put_qp; 1034 + 1035 + cep = erdma_cep_alloc(dev); 1036 + if (!cep) { 1037 + ret = -ENOMEM; 1038 + goto error_release_sock; 1039 + } 1040 + 1041 + erdma_cep_set_inuse(cep); 1042 + 1043 + /* Associate QP with CEP */ 1044 + erdma_cep_get(cep); 1045 + qp->cep = cep; 1046 + cep->qp = qp; 1047 + 1048 + /* Associate cm_id with CEP */ 1049 + id->add_ref(id); 1050 + cep->cm_id = id; 1051 + 1052 + /* 1053 + * 6: Allocate a sufficient number of work elements 1054 + * to allow concurrent handling of local + peer close 1055 + * events, MPA header processing + MPA timeout, connected event 1056 + * and connect timeout. 
1057 + */ 1058 + ret = erdma_cm_alloc_work(cep, 6); 1059 + if (ret != 0) { 1060 + ret = -ENOMEM; 1061 + goto error_release_cep; 1062 + } 1063 + 1064 + cep->ird = params->ird; 1065 + cep->ord = params->ord; 1066 + cep->state = ERDMA_EPSTATE_CONNECTING; 1067 + 1068 + erdma_cep_socket_assoc(cep, s); 1069 + 1070 + if (pd_len) { 1071 + cep->pd_len = pd_len; 1072 + cep->private_data = kmalloc(pd_len, GFP_KERNEL); 1073 + if (!cep->private_data) { 1074 + ret = -ENOMEM; 1075 + goto error_disassoc; 1076 + } 1077 + 1078 + memcpy(cep->private_data, params->private_data, 1079 + params->private_data_len); 1080 + } 1081 + 1082 + ret = kernel_bindconnect(s, laddr, sizeof(*laddr), raddr, 1083 + sizeof(*raddr), O_NONBLOCK); 1084 + if (ret != -EINPROGRESS && ret != 0) { 1085 + goto error_disassoc; 1086 + } else if (ret == 0) { 1087 + ret = erdma_cm_queue_work(cep, ERDMA_CM_WORK_CONNECTED); 1088 + if (ret) 1089 + goto error_disassoc; 1090 + } else { 1091 + ret = erdma_cm_queue_work(cep, ERDMA_CM_WORK_CONNECTTIMEOUT); 1092 + if (ret) 1093 + goto error_disassoc; 1094 + } 1095 + 1096 + erdma_cep_set_free(cep); 1097 + return 0; 1098 + 1099 + error_disassoc: 1100 + kfree(cep->private_data); 1101 + cep->private_data = NULL; 1102 + cep->pd_len = 0; 1103 + 1104 + erdma_socket_disassoc(s); 1105 + 1106 + error_release_cep: 1107 + /* disassoc with cm_id */ 1108 + cep->cm_id = NULL; 1109 + id->rem_ref(id); 1110 + 1111 + /* disassoc with qp */ 1112 + qp->cep = NULL; 1113 + erdma_cep_put(cep); 1114 + cep->qp = NULL; 1115 + 1116 + cep->state = ERDMA_EPSTATE_CLOSED; 1117 + 1118 + erdma_cep_set_free(cep); 1119 + 1120 + /* release the cep. */ 1121 + erdma_cep_put(cep); 1122 + 1123 + error_release_sock: 1124 + if (s) 1125 + sock_release(s); 1126 + error_put_qp: 1127 + erdma_qp_put(qp); 1128 + 1129 + return ret; 1130 + } 1131 + 1132 + int erdma_accept(struct iw_cm_id *id, struct iw_cm_conn_param *params) 1133 + { 1134 + struct erdma_dev *dev = to_edev(id->device); 1135 + struct erdma_cep *cep = (struct erdma_cep *)id->provider_data; 1136 + struct erdma_qp *qp; 1137 + struct erdma_qp_attrs qp_attrs; 1138 + int ret; 1139 + 1140 + erdma_cep_set_inuse(cep); 1141 + erdma_cep_put(cep); 1142 + 1143 + /* Free lingering inbound private data */ 1144 + if (cep->mpa.hdr.params.pd_len) { 1145 + cep->mpa.hdr.params.pd_len = 0; 1146 + kfree(cep->mpa.pdata); 1147 + cep->mpa.pdata = NULL; 1148 + } 1149 + erdma_cancel_mpatimer(cep); 1150 + 1151 + if (cep->state != ERDMA_EPSTATE_RECVD_MPAREQ) { 1152 + erdma_cep_set_free(cep); 1153 + erdma_cep_put(cep); 1154 + 1155 + return -ECONNRESET; 1156 + } 1157 + 1158 + qp = find_qp_by_qpn(dev, params->qpn); 1159 + if (!qp) 1160 + return -ENOENT; 1161 + erdma_qp_get(qp); 1162 + 1163 + down_write(&qp->state_lock); 1164 + if (qp->attrs.state > ERDMA_QP_STATE_RTR) { 1165 + ret = -EINVAL; 1166 + up_write(&qp->state_lock); 1167 + goto error; 1168 + } 1169 + 1170 + if (params->ord > dev->attrs.max_ord || 1171 + params->ird > dev->attrs.max_ord) { 1172 + ret = -EINVAL; 1173 + up_write(&qp->state_lock); 1174 + goto error; 1175 + } 1176 + 1177 + if (params->private_data_len > MPA_MAX_PRIVDATA) { 1178 + ret = -EINVAL; 1179 + up_write(&qp->state_lock); 1180 + goto error; 1181 + } 1182 + 1183 + cep->ird = params->ird; 1184 + cep->ord = params->ord; 1185 + 1186 + cep->cm_id = id; 1187 + id->add_ref(id); 1188 + 1189 + memset(&qp_attrs, 0, sizeof(qp_attrs)); 1190 + qp_attrs.orq_size = params->ord; 1191 + qp_attrs.irq_size = params->ird; 1192 + 1193 + qp_attrs.state = ERDMA_QP_STATE_RTS; 1194 + 1195 + /* Associate QP with 
CEP */ 1196 + erdma_cep_get(cep); 1197 + qp->cep = cep; 1198 + cep->qp = qp; 1199 + 1200 + cep->state = ERDMA_EPSTATE_RDMA_MODE; 1201 + 1202 + qp->attrs.qp_type = ERDMA_QP_PASSIVE; 1203 + qp->attrs.pd_len = params->private_data_len; 1204 + 1205 + if (qp->attrs.cc != __mpa_ext_cc(cep->mpa.ext_data.bits)) 1206 + qp->attrs.cc = COMPROMISE_CC; 1207 + 1208 + /* move to rts */ 1209 + ret = erdma_modify_qp_internal(qp, &qp_attrs, 1210 + ERDMA_QP_ATTR_STATE | 1211 + ERDMA_QP_ATTR_ORD | 1212 + ERDMA_QP_ATTR_LLP_HANDLE | 1213 + ERDMA_QP_ATTR_IRD | 1214 + ERDMA_QP_ATTR_MPA); 1215 + up_write(&qp->state_lock); 1216 + 1217 + if (ret) 1218 + goto error; 1219 + 1220 + cep->mpa.ext_data.bits = 0; 1221 + __mpa_ext_set_cc(&cep->mpa.ext_data.bits, qp->attrs.cc); 1222 + cep->mpa.ext_data.cookie = cpu_to_be32(cep->qp->attrs.cookie); 1223 + 1224 + ret = erdma_send_mpareqrep(cep, params->private_data, 1225 + params->private_data_len); 1226 + if (!ret) { 1227 + ret = erdma_cm_upcall(cep, IW_CM_EVENT_ESTABLISHED, 0); 1228 + if (ret) 1229 + goto error; 1230 + 1231 + erdma_cep_set_free(cep); 1232 + 1233 + return 0; 1234 + } 1235 + 1236 + error: 1237 + erdma_socket_disassoc(cep->sock); 1238 + sock_release(cep->sock); 1239 + cep->sock = NULL; 1240 + 1241 + cep->state = ERDMA_EPSTATE_CLOSED; 1242 + 1243 + if (cep->cm_id) { 1244 + cep->cm_id->rem_ref(id); 1245 + cep->cm_id = NULL; 1246 + } 1247 + 1248 + if (qp->cep) { 1249 + erdma_cep_put(cep); 1250 + qp->cep = NULL; 1251 + } 1252 + 1253 + cep->qp = NULL; 1254 + erdma_qp_put(qp); 1255 + 1256 + erdma_cep_set_free(cep); 1257 + erdma_cep_put(cep); 1258 + 1259 + return ret; 1260 + } 1261 + 1262 + int erdma_reject(struct iw_cm_id *id, const void *pdata, u8 plen) 1263 + { 1264 + struct erdma_cep *cep = (struct erdma_cep *)id->provider_data; 1265 + 1266 + erdma_cep_set_inuse(cep); 1267 + erdma_cep_put(cep); 1268 + 1269 + erdma_cancel_mpatimer(cep); 1270 + 1271 + if (cep->state != ERDMA_EPSTATE_RECVD_MPAREQ) { 1272 + erdma_cep_set_free(cep); 1273 + erdma_cep_put(cep); 1274 + 1275 + return -ECONNRESET; 1276 + } 1277 + 1278 + if (__mpa_rr_revision(cep->mpa.hdr.params.bits) == MPA_REVISION_EXT_1) { 1279 + cep->mpa.hdr.params.bits |= MPA_RR_FLAG_REJECT; /* reject */ 1280 + erdma_send_mpareqrep(cep, pdata, plen); 1281 + } 1282 + 1283 + erdma_socket_disassoc(cep->sock); 1284 + sock_release(cep->sock); 1285 + cep->sock = NULL; 1286 + 1287 + cep->state = ERDMA_EPSTATE_CLOSED; 1288 + 1289 + erdma_cep_set_free(cep); 1290 + erdma_cep_put(cep); 1291 + 1292 + return 0; 1293 + } 1294 + 1295 + int erdma_create_listen(struct iw_cm_id *id, int backlog) 1296 + { 1297 + struct socket *s; 1298 + struct erdma_cep *cep = NULL; 1299 + int ret = 0; 1300 + struct erdma_dev *dev = to_edev(id->device); 1301 + int addr_family = id->local_addr.ss_family; 1302 + struct sockaddr_in *laddr = &to_sockaddr_in(id->local_addr); 1303 + 1304 + if (addr_family != AF_INET) 1305 + return -EAFNOSUPPORT; 1306 + 1307 + ret = sock_create(addr_family, SOCK_STREAM, IPPROTO_TCP, &s); 1308 + if (ret < 0) 1309 + return ret; 1310 + 1311 + sock_set_reuseaddr(s->sk); 1312 + 1313 + /* For wildcard addr, limit binding to current device only */ 1314 + if (ipv4_is_zeronet(laddr->sin_addr.s_addr)) 1315 + s->sk->sk_bound_dev_if = dev->netdev->ifindex; 1316 + 1317 + ret = s->ops->bind(s, (struct sockaddr *)laddr, 1318 + sizeof(struct sockaddr_in)); 1319 + if (ret) 1320 + goto error; 1321 + 1322 + cep = erdma_cep_alloc(dev); 1323 + if (!cep) { 1324 + ret = -ENOMEM; 1325 + goto error; 1326 + } 1327 + erdma_cep_socket_assoc(cep, s); 1328 + 
1329 + ret = erdma_cm_alloc_work(cep, backlog); 1330 + if (ret) 1331 + goto error; 1332 + 1333 + ret = s->ops->listen(s, backlog); 1334 + if (ret) 1335 + goto error; 1336 + 1337 + cep->cm_id = id; 1338 + id->add_ref(id); 1339 + 1340 + if (!id->provider_data) { 1341 + id->provider_data = 1342 + kmalloc(sizeof(struct list_head), GFP_KERNEL); 1343 + if (!id->provider_data) { 1344 + ret = -ENOMEM; 1345 + goto error; 1346 + } 1347 + INIT_LIST_HEAD((struct list_head *)id->provider_data); 1348 + } 1349 + 1350 + list_add_tail(&cep->listenq, (struct list_head *)id->provider_data); 1351 + cep->state = ERDMA_EPSTATE_LISTENING; 1352 + 1353 + return 0; 1354 + 1355 + error: 1356 + if (cep) { 1357 + erdma_cep_set_inuse(cep); 1358 + 1359 + if (cep->cm_id) { 1360 + cep->cm_id->rem_ref(cep->cm_id); 1361 + cep->cm_id = NULL; 1362 + } 1363 + cep->sock = NULL; 1364 + erdma_socket_disassoc(s); 1365 + cep->state = ERDMA_EPSTATE_CLOSED; 1366 + 1367 + erdma_cep_set_free(cep); 1368 + erdma_cep_put(cep); 1369 + } 1370 + sock_release(s); 1371 + 1372 + return ret; 1373 + } 1374 + 1375 + static void erdma_drop_listeners(struct iw_cm_id *id) 1376 + { 1377 + struct list_head *p, *tmp; 1378 + /* 1379 + * In case of a wildcard rdma_listen on a multi-homed device, 1380 + * a listener's IWCM id is associated with more than one listening CEP. 1381 + */ 1382 + list_for_each_safe(p, tmp, (struct list_head *)id->provider_data) { 1383 + struct erdma_cep *cep = 1384 + list_entry(p, struct erdma_cep, listenq); 1385 + 1386 + list_del(p); 1387 + 1388 + erdma_cep_set_inuse(cep); 1389 + 1390 + if (cep->cm_id) { 1391 + cep->cm_id->rem_ref(cep->cm_id); 1392 + cep->cm_id = NULL; 1393 + } 1394 + if (cep->sock) { 1395 + erdma_socket_disassoc(cep->sock); 1396 + sock_release(cep->sock); 1397 + cep->sock = NULL; 1398 + } 1399 + cep->state = ERDMA_EPSTATE_CLOSED; 1400 + erdma_cep_set_free(cep); 1401 + erdma_cep_put(cep); 1402 + } 1403 + } 1404 + 1405 + int erdma_destroy_listen(struct iw_cm_id *id) 1406 + { 1407 + if (!id->provider_data) 1408 + return 0; 1409 + 1410 + erdma_drop_listeners(id); 1411 + kfree(id->provider_data); 1412 + id->provider_data = NULL; 1413 + 1414 + return 0; 1415 + } 1416 + 1417 + int erdma_cm_init(void) 1418 + { 1419 + erdma_cm_wq = create_singlethread_workqueue("erdma_cm_wq"); 1420 + if (!erdma_cm_wq) 1421 + return -ENOMEM; 1422 + 1423 + return 0; 1424 + } 1425 + 1426 + void erdma_cm_exit(void) 1427 + { 1428 + if (erdma_cm_wq) 1429 + destroy_workqueue(erdma_cm_wq); 1430 + }
+167
drivers/infiniband/hw/erdma/erdma_cm.h
··· 1 + /* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */ 2 + 3 + /* Authors: Cheng Xu <chengyou@linux.alibaba.com> */ 4 + /* Kai Shen <kaishen@linux.alibaba.com> */ 5 + /* Copyright (c) 2020-2022, Alibaba Group. */ 6 + 7 + /* Authors: Bernard Metzler <bmt@zurich.ibm.com> */ 8 + /* Greg Joyce <greg@opengridcomputing.com> */ 9 + /* Copyright (c) 2008-2019, IBM Corporation */ 10 + /* Copyright (c) 2017, Open Grid Computing, Inc. */ 11 + 12 + #ifndef __ERDMA_CM_H__ 13 + #define __ERDMA_CM_H__ 14 + 15 + #include <linux/tcp.h> 16 + #include <net/sock.h> 17 + #include <rdma/iw_cm.h> 18 + 19 + /* iWarp MPA protocol defs */ 20 + #define MPA_REVISION_EXT_1 129 21 + #define MPA_MAX_PRIVDATA RDMA_MAX_PRIVATE_DATA 22 + #define MPA_KEY_REQ "MPA ID Req Frame" 23 + #define MPA_KEY_REP "MPA ID Rep Frame" 24 + #define MPA_KEY_SIZE 16 25 + #define MPA_DEFAULT_HDR_LEN 28 26 + 27 + struct mpa_rr_params { 28 + __be16 bits; 29 + __be16 pd_len; 30 + }; 31 + 32 + /* 33 + * MPA request/response Hdr bits & fields 34 + */ 35 + enum { 36 + MPA_RR_FLAG_MARKERS = __cpu_to_be16(0x8000), 37 + MPA_RR_FLAG_CRC = __cpu_to_be16(0x4000), 38 + MPA_RR_FLAG_REJECT = __cpu_to_be16(0x2000), 39 + MPA_RR_RESERVED = __cpu_to_be16(0x1f00), 40 + MPA_RR_MASK_REVISION = __cpu_to_be16(0x00ff) 41 + }; 42 + 43 + /* 44 + * MPA request/reply header 45 + */ 46 + struct mpa_rr { 47 + u8 key[16]; 48 + struct mpa_rr_params params; 49 + }; 50 + 51 + struct erdma_mpa_ext { 52 + __be32 cookie; 53 + __be32 bits; 54 + }; 55 + 56 + enum { 57 + MPA_EXT_FLAG_CC = cpu_to_be32(0x0000000f), 58 + }; 59 + 60 + struct erdma_mpa_info { 61 + struct mpa_rr hdr; /* peer mpa hdr in host byte order */ 62 + struct erdma_mpa_ext ext_data; 63 + char *pdata; 64 + int bytes_rcvd; 65 + }; 66 + 67 + struct erdma_sk_upcalls { 68 + void (*sk_state_change)(struct sock *sk); 69 + void (*sk_data_ready)(struct sock *sk, int bytes); 70 + void (*sk_error_report)(struct sock *sk); 71 + }; 72 + 73 + struct erdma_dev; 74 + 75 + enum erdma_cep_state { 76 + ERDMA_EPSTATE_IDLE = 1, 77 + ERDMA_EPSTATE_LISTENING, 78 + ERDMA_EPSTATE_CONNECTING, 79 + ERDMA_EPSTATE_AWAIT_MPAREQ, 80 + ERDMA_EPSTATE_RECVD_MPAREQ, 81 + ERDMA_EPSTATE_AWAIT_MPAREP, 82 + ERDMA_EPSTATE_RDMA_MODE, 83 + ERDMA_EPSTATE_CLOSED 84 + }; 85 + 86 + struct erdma_cep { 87 + struct iw_cm_id *cm_id; 88 + struct erdma_dev *dev; 89 + struct list_head devq; 90 + spinlock_t lock; 91 + struct kref ref; 92 + int in_use; 93 + wait_queue_head_t waitq; 94 + enum erdma_cep_state state; 95 + 96 + struct list_head listenq; 97 + struct erdma_cep *listen_cep; 98 + 99 + struct erdma_qp *qp; 100 + struct socket *sock; 101 + 102 + struct erdma_cm_work *mpa_timer; 103 + struct list_head work_freelist; 104 + 105 + struct erdma_mpa_info mpa; 106 + int ord; 107 + int ird; 108 + 109 + int pd_len; 110 + /* hold user's private data. 
*/ 111 + void *private_data; 112 + 113 + /* Saved upcalls of socket llp.sock */ 114 + void (*sk_state_change)(struct sock *sk); 115 + void (*sk_data_ready)(struct sock *sk); 116 + void (*sk_error_report)(struct sock *sk); 117 + }; 118 + 119 + #define MPAREQ_TIMEOUT (HZ * 20) 120 + #define MPAREP_TIMEOUT (HZ * 10) 121 + #define CONNECT_TIMEOUT (HZ * 10) 122 + 123 + enum erdma_work_type { 124 + ERDMA_CM_WORK_ACCEPT = 1, 125 + ERDMA_CM_WORK_READ_MPAHDR, 126 + ERDMA_CM_WORK_CLOSE_LLP, /* close socket */ 127 + ERDMA_CM_WORK_PEER_CLOSE, /* socket indicated peer close */ 128 + ERDMA_CM_WORK_MPATIMEOUT, 129 + ERDMA_CM_WORK_CONNECTED, 130 + ERDMA_CM_WORK_CONNECTTIMEOUT 131 + }; 132 + 133 + struct erdma_cm_work { 134 + struct delayed_work work; 135 + struct list_head list; 136 + enum erdma_work_type type; 137 + struct erdma_cep *cep; 138 + }; 139 + 140 + #define to_sockaddr_in(a) (*(struct sockaddr_in *)(&(a))) 141 + 142 + static inline int getname_peer(struct socket *s, struct sockaddr_storage *a) 143 + { 144 + return s->ops->getname(s, (struct sockaddr *)a, 1); 145 + } 146 + 147 + static inline int getname_local(struct socket *s, struct sockaddr_storage *a) 148 + { 149 + return s->ops->getname(s, (struct sockaddr *)a, 0); 150 + } 151 + 152 + int erdma_connect(struct iw_cm_id *id, struct iw_cm_conn_param *param); 153 + int erdma_accept(struct iw_cm_id *id, struct iw_cm_conn_param *param); 154 + int erdma_reject(struct iw_cm_id *id, const void *pdata, u8 plen); 155 + int erdma_create_listen(struct iw_cm_id *id, int backlog); 156 + int erdma_destroy_listen(struct iw_cm_id *id); 157 + 158 + void erdma_cep_get(struct erdma_cep *ceq); 159 + void erdma_cep_put(struct erdma_cep *ceq); 160 + int erdma_cm_queue_work(struct erdma_cep *ceq, enum erdma_work_type type); 161 + 162 + int erdma_cm_init(void); 163 + void erdma_cm_exit(void); 164 + 165 + #define sk_to_cep(sk) ((struct erdma_cep *)((sk)->sk_user_data)) 166 + 167 + #endif
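The MPA framing constants in erdma_cm.h are easy to cross-check: MPA_DEFAULT_HDR_LEN (28) is the 16-byte key plus the 4-byte mpa_rr_params, plus the 8-byte erdma_mpa_ext trailer exchanged when MPA_REVISION_EXT_1 is negotiated. Below is a minimal userspace sketch of that arithmetic, not driver code; the big-endian kernel types are replaced with plain fixed-width integers and a typical ABI with no extra struct padding is assumed.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Same field widths as the kernel header; only the sizes are checked here. */
struct mpa_rr_params {
	uint16_t bits;    /* __be16 in the kernel header */
	uint16_t pd_len;  /* __be16 */
};

struct mpa_rr {
	uint8_t key[16];  /* "MPA ID Req Frame" / "MPA ID Rep Frame" */
	struct mpa_rr_params params;
};

struct erdma_mpa_ext {
	uint32_t cookie;  /* __be32 */
	uint32_t bits;    /* __be32 */
};

int main(void)
{
	/* 16-byte key + 4-byte params = 20, plus the 8-byte extension
	 * gives MPA_DEFAULT_HDR_LEN (28). */
	assert(sizeof(struct mpa_rr) == 20);
	assert(sizeof(struct erdma_mpa_ext) == 8);
	assert(sizeof(struct mpa_rr) + sizeof(struct erdma_mpa_ext) == 28);
	printf("MPA header sizes add up to MPA_DEFAULT_HDR_LEN\n");
	return 0;
}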
+493
drivers/infiniband/hw/erdma/erdma_cmdq.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause 2 + 3 + /* Authors: Cheng Xu <chengyou@linux.alibaba.com> */ 4 + /* Kai Shen <kaishen@linux.alibaba.com> */ 5 + /* Copyright (c) 2020-2022, Alibaba Group. */ 6 + 7 + #include <linux/kernel.h> 8 + #include <linux/pci.h> 9 + #include <linux/types.h> 10 + 11 + #include "erdma.h" 12 + #include "erdma_hw.h" 13 + #include "erdma_verbs.h" 14 + 15 + static void arm_cmdq_cq(struct erdma_cmdq *cmdq) 16 + { 17 + struct erdma_dev *dev = container_of(cmdq, struct erdma_dev, cmdq); 18 + u64 db_data = FIELD_PREP(ERDMA_CQDB_CI_MASK, cmdq->cq.ci) | 19 + FIELD_PREP(ERDMA_CQDB_ARM_MASK, 1) | 20 + FIELD_PREP(ERDMA_CQDB_CMDSN_MASK, cmdq->cq.cmdsn) | 21 + FIELD_PREP(ERDMA_CQDB_IDX_MASK, cmdq->cq.cmdsn); 22 + 23 + *cmdq->cq.db_record = db_data; 24 + writeq(db_data, dev->func_bar + ERDMA_CMDQ_CQDB_REG); 25 + 26 + atomic64_inc(&cmdq->cq.armed_num); 27 + } 28 + 29 + static void kick_cmdq_db(struct erdma_cmdq *cmdq) 30 + { 31 + struct erdma_dev *dev = container_of(cmdq, struct erdma_dev, cmdq); 32 + u64 db_data = FIELD_PREP(ERDMA_CMD_HDR_WQEBB_INDEX_MASK, cmdq->sq.pi); 33 + 34 + *cmdq->sq.db_record = db_data; 35 + writeq(db_data, dev->func_bar + ERDMA_CMDQ_SQDB_REG); 36 + } 37 + 38 + static struct erdma_comp_wait *get_comp_wait(struct erdma_cmdq *cmdq) 39 + { 40 + int comp_idx; 41 + 42 + spin_lock(&cmdq->lock); 43 + comp_idx = find_first_zero_bit(cmdq->comp_wait_bitmap, 44 + cmdq->max_outstandings); 45 + if (comp_idx == cmdq->max_outstandings) { 46 + spin_unlock(&cmdq->lock); 47 + return ERR_PTR(-ENOMEM); 48 + } 49 + 50 + __set_bit(comp_idx, cmdq->comp_wait_bitmap); 51 + spin_unlock(&cmdq->lock); 52 + 53 + return &cmdq->wait_pool[comp_idx]; 54 + } 55 + 56 + static void put_comp_wait(struct erdma_cmdq *cmdq, 57 + struct erdma_comp_wait *comp_wait) 58 + { 59 + int used; 60 + 61 + cmdq->wait_pool[comp_wait->ctx_id].cmd_status = ERDMA_CMD_STATUS_INIT; 62 + spin_lock(&cmdq->lock); 63 + used = __test_and_clear_bit(comp_wait->ctx_id, cmdq->comp_wait_bitmap); 64 + spin_unlock(&cmdq->lock); 65 + 66 + WARN_ON(!used); 67 + } 68 + 69 + static int erdma_cmdq_wait_res_init(struct erdma_dev *dev, 70 + struct erdma_cmdq *cmdq) 71 + { 72 + int i; 73 + 74 + cmdq->wait_pool = 75 + devm_kcalloc(&dev->pdev->dev, cmdq->max_outstandings, 76 + sizeof(struct erdma_comp_wait), GFP_KERNEL); 77 + if (!cmdq->wait_pool) 78 + return -ENOMEM; 79 + 80 + spin_lock_init(&cmdq->lock); 81 + cmdq->comp_wait_bitmap = devm_bitmap_zalloc( 82 + &dev->pdev->dev, cmdq->max_outstandings, GFP_KERNEL); 83 + if (!cmdq->comp_wait_bitmap) 84 + return -ENOMEM; 85 + 86 + for (i = 0; i < cmdq->max_outstandings; i++) { 87 + init_completion(&cmdq->wait_pool[i].wait_event); 88 + cmdq->wait_pool[i].ctx_id = i; 89 + } 90 + 91 + return 0; 92 + } 93 + 94 + static int erdma_cmdq_sq_init(struct erdma_dev *dev) 95 + { 96 + struct erdma_cmdq *cmdq = &dev->cmdq; 97 + struct erdma_cmdq_sq *sq = &cmdq->sq; 98 + u32 buf_size; 99 + 100 + sq->wqebb_cnt = SQEBB_COUNT(ERDMA_CMDQ_SQE_SIZE); 101 + sq->depth = cmdq->max_outstandings * sq->wqebb_cnt; 102 + 103 + buf_size = sq->depth << SQEBB_SHIFT; 104 + 105 + sq->qbuf = 106 + dma_alloc_coherent(&dev->pdev->dev, WARPPED_BUFSIZE(buf_size), 107 + &sq->qbuf_dma_addr, GFP_KERNEL); 108 + if (!sq->qbuf) 109 + return -ENOMEM; 110 + 111 + sq->db_record = (u64 *)(sq->qbuf + buf_size); 112 + 113 + spin_lock_init(&sq->lock); 114 + 115 + erdma_reg_write32(dev, ERDMA_REGS_CMDQ_SQ_ADDR_H_REG, 116 + upper_32_bits(sq->qbuf_dma_addr)); 117 + erdma_reg_write32(dev, ERDMA_REGS_CMDQ_SQ_ADDR_L_REG, 118 
+ lower_32_bits(sq->qbuf_dma_addr)); 119 + erdma_reg_write32(dev, ERDMA_REGS_CMDQ_DEPTH_REG, sq->depth); 120 + erdma_reg_write64(dev, ERDMA_CMDQ_SQ_DB_HOST_ADDR_REG, 121 + sq->qbuf_dma_addr + buf_size); 122 + 123 + return 0; 124 + } 125 + 126 + static int erdma_cmdq_cq_init(struct erdma_dev *dev) 127 + { 128 + struct erdma_cmdq *cmdq = &dev->cmdq; 129 + struct erdma_cmdq_cq *cq = &cmdq->cq; 130 + u32 buf_size; 131 + 132 + cq->depth = cmdq->sq.depth; 133 + buf_size = cq->depth << CQE_SHIFT; 134 + 135 + cq->qbuf = 136 + dma_alloc_coherent(&dev->pdev->dev, WARPPED_BUFSIZE(buf_size), 137 + &cq->qbuf_dma_addr, GFP_KERNEL | __GFP_ZERO); 138 + if (!cq->qbuf) 139 + return -ENOMEM; 140 + 141 + spin_lock_init(&cq->lock); 142 + 143 + cq->db_record = (u64 *)(cq->qbuf + buf_size); 144 + 145 + atomic64_set(&cq->armed_num, 0); 146 + 147 + erdma_reg_write32(dev, ERDMA_REGS_CMDQ_CQ_ADDR_H_REG, 148 + upper_32_bits(cq->qbuf_dma_addr)); 149 + erdma_reg_write32(dev, ERDMA_REGS_CMDQ_CQ_ADDR_L_REG, 150 + lower_32_bits(cq->qbuf_dma_addr)); 151 + erdma_reg_write64(dev, ERDMA_CMDQ_CQ_DB_HOST_ADDR_REG, 152 + cq->qbuf_dma_addr + buf_size); 153 + 154 + return 0; 155 + } 156 + 157 + static int erdma_cmdq_eq_init(struct erdma_dev *dev) 158 + { 159 + struct erdma_cmdq *cmdq = &dev->cmdq; 160 + struct erdma_eq *eq = &cmdq->eq; 161 + u32 buf_size; 162 + 163 + eq->depth = cmdq->max_outstandings; 164 + buf_size = eq->depth << EQE_SHIFT; 165 + 166 + eq->qbuf = 167 + dma_alloc_coherent(&dev->pdev->dev, WARPPED_BUFSIZE(buf_size), 168 + &eq->qbuf_dma_addr, GFP_KERNEL | __GFP_ZERO); 169 + if (!eq->qbuf) 170 + return -ENOMEM; 171 + 172 + spin_lock_init(&eq->lock); 173 + atomic64_set(&eq->event_num, 0); 174 + 175 + eq->db_addr = 176 + (u64 __iomem *)(dev->func_bar + ERDMA_REGS_CEQ_DB_BASE_REG); 177 + eq->db_record = (u64 *)(eq->qbuf + buf_size); 178 + 179 + erdma_reg_write32(dev, ERDMA_REGS_CMDQ_EQ_ADDR_H_REG, 180 + upper_32_bits(eq->qbuf_dma_addr)); 181 + erdma_reg_write32(dev, ERDMA_REGS_CMDQ_EQ_ADDR_L_REG, 182 + lower_32_bits(eq->qbuf_dma_addr)); 183 + erdma_reg_write32(dev, ERDMA_REGS_CMDQ_EQ_DEPTH_REG, eq->depth); 184 + erdma_reg_write64(dev, ERDMA_CMDQ_EQ_DB_HOST_ADDR_REG, 185 + eq->qbuf_dma_addr + buf_size); 186 + 187 + return 0; 188 + } 189 + 190 + int erdma_cmdq_init(struct erdma_dev *dev) 191 + { 192 + int err, i; 193 + struct erdma_cmdq *cmdq = &dev->cmdq; 194 + u32 sts, ctrl; 195 + 196 + cmdq->max_outstandings = ERDMA_CMDQ_MAX_OUTSTANDING; 197 + cmdq->use_event = false; 198 + 199 + sema_init(&cmdq->credits, cmdq->max_outstandings); 200 + 201 + err = erdma_cmdq_wait_res_init(dev, cmdq); 202 + if (err) 203 + return err; 204 + 205 + err = erdma_cmdq_sq_init(dev); 206 + if (err) 207 + return err; 208 + 209 + err = erdma_cmdq_cq_init(dev); 210 + if (err) 211 + goto err_destroy_sq; 212 + 213 + err = erdma_cmdq_eq_init(dev); 214 + if (err) 215 + goto err_destroy_cq; 216 + 217 + ctrl = FIELD_PREP(ERDMA_REG_DEV_CTRL_INIT_MASK, 1); 218 + erdma_reg_write32(dev, ERDMA_REGS_DEV_CTRL_REG, ctrl); 219 + 220 + for (i = 0; i < ERDMA_WAIT_DEV_DONE_CNT; i++) { 221 + sts = erdma_reg_read32_filed(dev, ERDMA_REGS_DEV_ST_REG, 222 + ERDMA_REG_DEV_ST_INIT_DONE_MASK); 223 + if (sts) 224 + break; 225 + 226 + msleep(ERDMA_REG_ACCESS_WAIT_MS); 227 + } 228 + 229 + if (i == ERDMA_WAIT_DEV_DONE_CNT) { 230 + dev_err(&dev->pdev->dev, "wait init done failed.\n"); 231 + err = -ETIMEDOUT; 232 + goto err_destroy_eq; 233 + } 234 + 235 + set_bit(ERDMA_CMDQ_STATE_OK_BIT, &cmdq->state); 236 + 237 + return 0; 238 + 239 + err_destroy_eq: 240 + 
dma_free_coherent(&dev->pdev->dev, 241 + (cmdq->eq.depth << EQE_SHIFT) + 242 + ERDMA_EXTRA_BUFFER_SIZE, 243 + cmdq->eq.qbuf, cmdq->eq.qbuf_dma_addr); 244 + 245 + err_destroy_cq: 246 + dma_free_coherent(&dev->pdev->dev, 247 + (cmdq->cq.depth << CQE_SHIFT) + 248 + ERDMA_EXTRA_BUFFER_SIZE, 249 + cmdq->cq.qbuf, cmdq->cq.qbuf_dma_addr); 250 + 251 + err_destroy_sq: 252 + dma_free_coherent(&dev->pdev->dev, 253 + (cmdq->sq.depth << SQEBB_SHIFT) + 254 + ERDMA_EXTRA_BUFFER_SIZE, 255 + cmdq->sq.qbuf, cmdq->sq.qbuf_dma_addr); 256 + 257 + return err; 258 + } 259 + 260 + void erdma_finish_cmdq_init(struct erdma_dev *dev) 261 + { 262 + /* after device init successfully, change cmdq to event mode. */ 263 + dev->cmdq.use_event = true; 264 + arm_cmdq_cq(&dev->cmdq); 265 + } 266 + 267 + void erdma_cmdq_destroy(struct erdma_dev *dev) 268 + { 269 + struct erdma_cmdq *cmdq = &dev->cmdq; 270 + 271 + clear_bit(ERDMA_CMDQ_STATE_OK_BIT, &cmdq->state); 272 + 273 + dma_free_coherent(&dev->pdev->dev, 274 + (cmdq->eq.depth << EQE_SHIFT) + 275 + ERDMA_EXTRA_BUFFER_SIZE, 276 + cmdq->eq.qbuf, cmdq->eq.qbuf_dma_addr); 277 + dma_free_coherent(&dev->pdev->dev, 278 + (cmdq->sq.depth << SQEBB_SHIFT) + 279 + ERDMA_EXTRA_BUFFER_SIZE, 280 + cmdq->sq.qbuf, cmdq->sq.qbuf_dma_addr); 281 + dma_free_coherent(&dev->pdev->dev, 282 + (cmdq->cq.depth << CQE_SHIFT) + 283 + ERDMA_EXTRA_BUFFER_SIZE, 284 + cmdq->cq.qbuf, cmdq->cq.qbuf_dma_addr); 285 + } 286 + 287 + static void *get_next_valid_cmdq_cqe(struct erdma_cmdq *cmdq) 288 + { 289 + __be32 *cqe = get_queue_entry(cmdq->cq.qbuf, cmdq->cq.ci, 290 + cmdq->cq.depth, CQE_SHIFT); 291 + u32 owner = FIELD_GET(ERDMA_CQE_HDR_OWNER_MASK, 292 + __be32_to_cpu(READ_ONCE(*cqe))); 293 + 294 + return owner ^ !!(cmdq->cq.ci & cmdq->cq.depth) ? cqe : NULL; 295 + } 296 + 297 + static void push_cmdq_sqe(struct erdma_cmdq *cmdq, u64 *req, size_t req_len, 298 + struct erdma_comp_wait *comp_wait) 299 + { 300 + __le64 *wqe; 301 + u64 hdr = *req; 302 + 303 + comp_wait->cmd_status = ERDMA_CMD_STATUS_ISSUED; 304 + reinit_completion(&comp_wait->wait_event); 305 + comp_wait->sq_pi = cmdq->sq.pi; 306 + 307 + wqe = get_queue_entry(cmdq->sq.qbuf, cmdq->sq.pi, cmdq->sq.depth, 308 + SQEBB_SHIFT); 309 + memcpy(wqe, req, req_len); 310 + 311 + cmdq->sq.pi += cmdq->sq.wqebb_cnt; 312 + hdr |= FIELD_PREP(ERDMA_CMD_HDR_WQEBB_INDEX_MASK, cmdq->sq.pi) | 313 + FIELD_PREP(ERDMA_CMD_HDR_CONTEXT_COOKIE_MASK, 314 + comp_wait->ctx_id) | 315 + FIELD_PREP(ERDMA_CMD_HDR_WQEBB_CNT_MASK, cmdq->sq.wqebb_cnt - 1); 316 + *wqe = cpu_to_le64(hdr); 317 + 318 + kick_cmdq_db(cmdq); 319 + } 320 + 321 + static int erdma_poll_single_cmd_completion(struct erdma_cmdq *cmdq) 322 + { 323 + struct erdma_comp_wait *comp_wait; 324 + u32 hdr0, sqe_idx; 325 + __be32 *cqe; 326 + u16 ctx_id; 327 + u64 *sqe; 328 + int i; 329 + 330 + cqe = get_next_valid_cmdq_cqe(cmdq); 331 + if (!cqe) 332 + return -EAGAIN; 333 + 334 + cmdq->cq.ci++; 335 + 336 + dma_rmb(); 337 + hdr0 = __be32_to_cpu(*cqe); 338 + sqe_idx = __be32_to_cpu(*(cqe + 1)); 339 + 340 + sqe = get_queue_entry(cmdq->sq.qbuf, sqe_idx, cmdq->sq.depth, 341 + SQEBB_SHIFT); 342 + ctx_id = FIELD_GET(ERDMA_CMD_HDR_CONTEXT_COOKIE_MASK, *sqe); 343 + comp_wait = &cmdq->wait_pool[ctx_id]; 344 + if (comp_wait->cmd_status != ERDMA_CMD_STATUS_ISSUED) 345 + return -EIO; 346 + 347 + comp_wait->cmd_status = ERDMA_CMD_STATUS_FINISHED; 348 + comp_wait->comp_status = FIELD_GET(ERDMA_CQE_HDR_SYNDROME_MASK, hdr0); 349 + cmdq->sq.ci += cmdq->sq.wqebb_cnt; 350 + 351 + for (i = 0; i < 4; i++) 352 + comp_wait->comp_data[i] = 
__be32_to_cpu(*(cqe + 2 + i)); 353 + 354 + if (cmdq->use_event) 355 + complete(&comp_wait->wait_event); 356 + 357 + return 0; 358 + } 359 + 360 + static void erdma_polling_cmd_completions(struct erdma_cmdq *cmdq) 361 + { 362 + unsigned long flags; 363 + u16 comp_num; 364 + 365 + spin_lock_irqsave(&cmdq->cq.lock, flags); 366 + 367 + /* We must have less than # of max_outstandings 368 + * completions at one time. 369 + */ 370 + for (comp_num = 0; comp_num < cmdq->max_outstandings; comp_num++) 371 + if (erdma_poll_single_cmd_completion(cmdq)) 372 + break; 373 + 374 + if (comp_num && cmdq->use_event) 375 + arm_cmdq_cq(cmdq); 376 + 377 + spin_unlock_irqrestore(&cmdq->cq.lock, flags); 378 + } 379 + 380 + void erdma_cmdq_completion_handler(struct erdma_cmdq *cmdq) 381 + { 382 + int got_event = 0; 383 + 384 + if (!test_bit(ERDMA_CMDQ_STATE_OK_BIT, &cmdq->state) || 385 + !cmdq->use_event) 386 + return; 387 + 388 + while (get_next_valid_eqe(&cmdq->eq)) { 389 + cmdq->eq.ci++; 390 + got_event++; 391 + } 392 + 393 + if (got_event) { 394 + cmdq->cq.cmdsn++; 395 + erdma_polling_cmd_completions(cmdq); 396 + } 397 + 398 + notify_eq(&cmdq->eq); 399 + } 400 + 401 + static int erdma_poll_cmd_completion(struct erdma_comp_wait *comp_ctx, 402 + struct erdma_cmdq *cmdq, u32 timeout) 403 + { 404 + unsigned long comp_timeout = jiffies + msecs_to_jiffies(timeout); 405 + 406 + while (1) { 407 + erdma_polling_cmd_completions(cmdq); 408 + if (comp_ctx->cmd_status != ERDMA_CMD_STATUS_ISSUED) 409 + break; 410 + 411 + if (time_is_before_jiffies(comp_timeout)) 412 + return -ETIME; 413 + 414 + msleep(20); 415 + } 416 + 417 + return 0; 418 + } 419 + 420 + static int erdma_wait_cmd_completion(struct erdma_comp_wait *comp_ctx, 421 + struct erdma_cmdq *cmdq, u32 timeout) 422 + { 423 + unsigned long flags = 0; 424 + 425 + wait_for_completion_timeout(&comp_ctx->wait_event, 426 + msecs_to_jiffies(timeout)); 427 + 428 + if (unlikely(comp_ctx->cmd_status != ERDMA_CMD_STATUS_FINISHED)) { 429 + spin_lock_irqsave(&cmdq->cq.lock, flags); 430 + comp_ctx->cmd_status = ERDMA_CMD_STATUS_TIMEOUT; 431 + spin_unlock_irqrestore(&cmdq->cq.lock, flags); 432 + return -ETIME; 433 + } 434 + 435 + return 0; 436 + } 437 + 438 + void erdma_cmdq_build_reqhdr(u64 *hdr, u32 mod, u32 op) 439 + { 440 + *hdr = FIELD_PREP(ERDMA_CMD_HDR_SUB_MOD_MASK, mod) | 441 + FIELD_PREP(ERDMA_CMD_HDR_OPCODE_MASK, op); 442 + } 443 + 444 + int erdma_post_cmd_wait(struct erdma_cmdq *cmdq, u64 *req, u32 req_size, 445 + u64 *resp0, u64 *resp1) 446 + { 447 + struct erdma_comp_wait *comp_wait; 448 + int ret; 449 + 450 + if (!test_bit(ERDMA_CMDQ_STATE_OK_BIT, &cmdq->state)) 451 + return -ENODEV; 452 + 453 + down(&cmdq->credits); 454 + 455 + comp_wait = get_comp_wait(cmdq); 456 + if (IS_ERR(comp_wait)) { 457 + clear_bit(ERDMA_CMDQ_STATE_OK_BIT, &cmdq->state); 458 + set_bit(ERDMA_CMDQ_STATE_CTX_ERR_BIT, &cmdq->state); 459 + up(&cmdq->credits); 460 + return PTR_ERR(comp_wait); 461 + } 462 + 463 + spin_lock(&cmdq->sq.lock); 464 + push_cmdq_sqe(cmdq, req, req_size, comp_wait); 465 + spin_unlock(&cmdq->sq.lock); 466 + 467 + if (cmdq->use_event) 468 + ret = erdma_wait_cmd_completion(comp_wait, cmdq, 469 + ERDMA_CMDQ_TIMEOUT_MS); 470 + else 471 + ret = erdma_poll_cmd_completion(comp_wait, cmdq, 472 + ERDMA_CMDQ_TIMEOUT_MS); 473 + 474 + if (ret) { 475 + set_bit(ERDMA_CMDQ_STATE_TIMEOUT_BIT, &cmdq->state); 476 + clear_bit(ERDMA_CMDQ_STATE_OK_BIT, &cmdq->state); 477 + goto out; 478 + } 479 + 480 + if (comp_wait->comp_status) 481 + ret = -EIO; 482 + 483 + if (resp0 && resp1) { 484 + *resp0 = 
*((u64 *)&comp_wait->comp_data[0]); 485 + *resp1 = *((u64 *)&comp_wait->comp_data[2]); 486 + } 487 + put_comp_wait(cmdq, comp_wait); 488 + 489 + out: 490 + up(&cmdq->credits); 491 + 492 + return ret; 493 + }
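get_next_valid_cmdq_cqe() above relies on the usual owner-bit (phase) trick for power-of-two rings: the consumer index never wraps, `ci & depth` flips parity on every pass around the ring, and an entry is new only when its owner bit differs from that pass parity. The following is a simplified, userspace-only illustration of the same test, not driver code; the ring depth and bit position are made up for the demo.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define DEPTH 8u	/* must be a power of two, like the real queues */

struct fake_cqe {
	uint32_t hdr;	/* bit 31 stands in for ERDMA_CQE_HDR_OWNER_MASK */
};

static bool cqe_is_new(const struct fake_cqe *ring, uint32_t ci)
{
	uint32_t owner = (ring[ci & (DEPTH - 1)].hdr >> 31) & 1;

	/* Same test as the driver: owner ^ !!(ci & depth). */
	return owner ^ !!(ci & DEPTH);
}

int main(void)
{
	struct fake_cqe ring[DEPTH] = { { 0 } };
	uint32_t ci = 0;

	/* "Hardware" publishes slot 0 with owner = 1 on the first pass. */
	ring[0].hdr = 1u << 31;
	printf("pass 0, owner 1: %s\n", cqe_is_new(ring, ci) ? "new" : "stale");

	/* After a full wrap the same slot stays stale until owner flips to 0. */
	ci = DEPTH;
	printf("pass 1, owner 1: %s\n", cqe_is_new(ring, ci) ? "new" : "stale");
	ring[0].hdr = 0;
	printf("pass 1, owner 0: %s\n", cqe_is_new(ring, ci) ? "new" : "stale");
	return 0;
}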
+205
drivers/infiniband/hw/erdma/erdma_cq.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause 2 + 3 + /* Authors: Cheng Xu <chengyou@linux.alibaba.com> */ 4 + /* Kai Shen <kaishen@linux.alibaba.com> */ 5 + /* Copyright (c) 2020-2022, Alibaba Group. */ 6 + 7 + #include <rdma/ib_verbs.h> 8 + 9 + #include "erdma_hw.h" 10 + #include "erdma_verbs.h" 11 + 12 + static void *get_next_valid_cqe(struct erdma_cq *cq) 13 + { 14 + __be32 *cqe = get_queue_entry(cq->kern_cq.qbuf, cq->kern_cq.ci, 15 + cq->depth, CQE_SHIFT); 16 + u32 owner = FIELD_GET(ERDMA_CQE_HDR_OWNER_MASK, 17 + __be32_to_cpu(READ_ONCE(*cqe))); 18 + 19 + return owner ^ !!(cq->kern_cq.ci & cq->depth) ? cqe : NULL; 20 + } 21 + 22 + static void notify_cq(struct erdma_cq *cq, u8 solcitied) 23 + { 24 + u64 db_data = 25 + FIELD_PREP(ERDMA_CQDB_IDX_MASK, (cq->kern_cq.notify_cnt)) | 26 + FIELD_PREP(ERDMA_CQDB_CQN_MASK, cq->cqn) | 27 + FIELD_PREP(ERDMA_CQDB_ARM_MASK, 1) | 28 + FIELD_PREP(ERDMA_CQDB_SOL_MASK, solcitied) | 29 + FIELD_PREP(ERDMA_CQDB_CMDSN_MASK, cq->kern_cq.cmdsn) | 30 + FIELD_PREP(ERDMA_CQDB_CI_MASK, cq->kern_cq.ci); 31 + 32 + *cq->kern_cq.db_record = db_data; 33 + writeq(db_data, cq->kern_cq.db); 34 + } 35 + 36 + int erdma_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags) 37 + { 38 + struct erdma_cq *cq = to_ecq(ibcq); 39 + unsigned long irq_flags; 40 + int ret = 0; 41 + 42 + spin_lock_irqsave(&cq->kern_cq.lock, irq_flags); 43 + 44 + notify_cq(cq, (flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED); 45 + 46 + if ((flags & IB_CQ_REPORT_MISSED_EVENTS) && get_next_valid_cqe(cq)) 47 + ret = 1; 48 + 49 + cq->kern_cq.notify_cnt++; 50 + 51 + spin_unlock_irqrestore(&cq->kern_cq.lock, irq_flags); 52 + 53 + return ret; 54 + } 55 + 56 + static const enum ib_wc_opcode wc_mapping_table[ERDMA_NUM_OPCODES] = { 57 + [ERDMA_OP_WRITE] = IB_WC_RDMA_WRITE, 58 + [ERDMA_OP_READ] = IB_WC_RDMA_READ, 59 + [ERDMA_OP_SEND] = IB_WC_SEND, 60 + [ERDMA_OP_SEND_WITH_IMM] = IB_WC_SEND, 61 + [ERDMA_OP_RECEIVE] = IB_WC_RECV, 62 + [ERDMA_OP_RECV_IMM] = IB_WC_RECV_RDMA_WITH_IMM, 63 + [ERDMA_OP_RECV_INV] = IB_WC_RECV, 64 + [ERDMA_OP_WRITE_WITH_IMM] = IB_WC_RDMA_WRITE, 65 + [ERDMA_OP_INVALIDATE] = IB_WC_LOCAL_INV, 66 + [ERDMA_OP_RSP_SEND_IMM] = IB_WC_RECV, 67 + [ERDMA_OP_SEND_WITH_INV] = IB_WC_SEND, 68 + [ERDMA_OP_REG_MR] = IB_WC_REG_MR, 69 + [ERDMA_OP_LOCAL_INV] = IB_WC_LOCAL_INV, 70 + [ERDMA_OP_READ_WITH_INV] = IB_WC_RDMA_READ, 71 + }; 72 + 73 + static const struct { 74 + enum erdma_wc_status erdma; 75 + enum ib_wc_status base; 76 + enum erdma_vendor_err vendor; 77 + } map_cqe_status[ERDMA_NUM_WC_STATUS] = { 78 + { ERDMA_WC_SUCCESS, IB_WC_SUCCESS, ERDMA_WC_VENDOR_NO_ERR }, 79 + { ERDMA_WC_GENERAL_ERR, IB_WC_GENERAL_ERR, ERDMA_WC_VENDOR_NO_ERR }, 80 + { ERDMA_WC_RECV_WQE_FORMAT_ERR, IB_WC_GENERAL_ERR, 81 + ERDMA_WC_VENDOR_INVALID_RQE }, 82 + { ERDMA_WC_RECV_STAG_INVALID_ERR, IB_WC_REM_ACCESS_ERR, 83 + ERDMA_WC_VENDOR_RQE_INVALID_STAG }, 84 + { ERDMA_WC_RECV_ADDR_VIOLATION_ERR, IB_WC_REM_ACCESS_ERR, 85 + ERDMA_WC_VENDOR_RQE_ADDR_VIOLATION }, 86 + { ERDMA_WC_RECV_RIGHT_VIOLATION_ERR, IB_WC_REM_ACCESS_ERR, 87 + ERDMA_WC_VENDOR_RQE_ACCESS_RIGHT_ERR }, 88 + { ERDMA_WC_RECV_PDID_ERR, IB_WC_REM_ACCESS_ERR, 89 + ERDMA_WC_VENDOR_RQE_INVALID_PD }, 90 + { ERDMA_WC_RECV_WARRPING_ERR, IB_WC_REM_ACCESS_ERR, 91 + ERDMA_WC_VENDOR_RQE_WRAP_ERR }, 92 + { ERDMA_WC_SEND_WQE_FORMAT_ERR, IB_WC_LOC_QP_OP_ERR, 93 + ERDMA_WC_VENDOR_INVALID_SQE }, 94 + { ERDMA_WC_SEND_WQE_ORD_EXCEED, IB_WC_GENERAL_ERR, 95 + ERDMA_WC_VENDOR_ZERO_ORD }, 96 + { ERDMA_WC_SEND_STAG_INVALID_ERR, IB_WC_LOC_ACCESS_ERR, 97 + 
ERDMA_WC_VENDOR_SQE_INVALID_STAG }, 98 + { ERDMA_WC_SEND_ADDR_VIOLATION_ERR, IB_WC_LOC_ACCESS_ERR, 99 + ERDMA_WC_VENDOR_SQE_ADDR_VIOLATION }, 100 + { ERDMA_WC_SEND_RIGHT_VIOLATION_ERR, IB_WC_LOC_ACCESS_ERR, 101 + ERDMA_WC_VENDOR_SQE_ACCESS_ERR }, 102 + { ERDMA_WC_SEND_PDID_ERR, IB_WC_LOC_ACCESS_ERR, 103 + ERDMA_WC_VENDOR_SQE_INVALID_PD }, 104 + { ERDMA_WC_SEND_WARRPING_ERR, IB_WC_LOC_ACCESS_ERR, 105 + ERDMA_WC_VENDOR_SQE_WARP_ERR }, 106 + { ERDMA_WC_FLUSH_ERR, IB_WC_WR_FLUSH_ERR, ERDMA_WC_VENDOR_NO_ERR }, 107 + { ERDMA_WC_RETRY_EXC_ERR, IB_WC_RETRY_EXC_ERR, ERDMA_WC_VENDOR_NO_ERR }, 108 + }; 109 + 110 + #define ERDMA_POLLCQ_NO_QP 1 111 + 112 + static int erdma_poll_one_cqe(struct erdma_cq *cq, struct ib_wc *wc) 113 + { 114 + struct erdma_dev *dev = to_edev(cq->ibcq.device); 115 + u8 opcode, syndrome, qtype; 116 + struct erdma_kqp *kern_qp; 117 + struct erdma_cqe *cqe; 118 + struct erdma_qp *qp; 119 + u16 wqe_idx, depth; 120 + u32 qpn, cqe_hdr; 121 + u64 *id_table; 122 + u64 *wqe_hdr; 123 + 124 + cqe = get_next_valid_cqe(cq); 125 + if (!cqe) 126 + return -EAGAIN; 127 + 128 + cq->kern_cq.ci++; 129 + 130 + /* cqbuf should be ready when we poll */ 131 + dma_rmb(); 132 + 133 + qpn = be32_to_cpu(cqe->qpn); 134 + wqe_idx = be32_to_cpu(cqe->qe_idx); 135 + cqe_hdr = be32_to_cpu(cqe->hdr); 136 + 137 + qp = find_qp_by_qpn(dev, qpn); 138 + if (!qp) 139 + return ERDMA_POLLCQ_NO_QP; 140 + 141 + kern_qp = &qp->kern_qp; 142 + 143 + qtype = FIELD_GET(ERDMA_CQE_HDR_QTYPE_MASK, cqe_hdr); 144 + syndrome = FIELD_GET(ERDMA_CQE_HDR_SYNDROME_MASK, cqe_hdr); 145 + opcode = FIELD_GET(ERDMA_CQE_HDR_OPCODE_MASK, cqe_hdr); 146 + 147 + if (qtype == ERDMA_CQE_QTYPE_SQ) { 148 + id_table = kern_qp->swr_tbl; 149 + depth = qp->attrs.sq_size; 150 + wqe_hdr = get_queue_entry(qp->kern_qp.sq_buf, wqe_idx, 151 + qp->attrs.sq_size, SQEBB_SHIFT); 152 + kern_qp->sq_ci = 153 + FIELD_GET(ERDMA_SQE_HDR_WQEBB_CNT_MASK, *wqe_hdr) + 154 + wqe_idx + 1; 155 + } else { 156 + id_table = kern_qp->rwr_tbl; 157 + depth = qp->attrs.rq_size; 158 + } 159 + wc->wr_id = id_table[wqe_idx & (depth - 1)]; 160 + wc->byte_len = be32_to_cpu(cqe->size); 161 + 162 + wc->wc_flags = 0; 163 + 164 + wc->opcode = wc_mapping_table[opcode]; 165 + if (opcode == ERDMA_OP_RECV_IMM || opcode == ERDMA_OP_RSP_SEND_IMM) { 166 + wc->ex.imm_data = cpu_to_be32(le32_to_cpu(cqe->imm_data)); 167 + wc->wc_flags |= IB_WC_WITH_IMM; 168 + } else if (opcode == ERDMA_OP_RECV_INV) { 169 + wc->ex.invalidate_rkey = be32_to_cpu(cqe->inv_rkey); 170 + wc->wc_flags |= IB_WC_WITH_INVALIDATE; 171 + } 172 + 173 + if (syndrome >= ERDMA_NUM_WC_STATUS) 174 + syndrome = ERDMA_WC_GENERAL_ERR; 175 + 176 + wc->status = map_cqe_status[syndrome].base; 177 + wc->vendor_err = map_cqe_status[syndrome].vendor; 178 + wc->qp = &qp->ibqp; 179 + 180 + return 0; 181 + } 182 + 183 + int erdma_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc) 184 + { 185 + struct erdma_cq *cq = to_ecq(ibcq); 186 + unsigned long flags; 187 + int npolled, ret; 188 + 189 + spin_lock_irqsave(&cq->kern_cq.lock, flags); 190 + 191 + for (npolled = 0; npolled < num_entries;) { 192 + ret = erdma_poll_one_cqe(cq, wc + npolled); 193 + 194 + if (ret == -EAGAIN) /* no received new CQEs. */ 195 + break; 196 + else if (ret) /* ignore invalid CQEs. */ 197 + continue; 198 + 199 + npolled++; 200 + } 201 + 202 + spin_unlock_irqrestore(&cq->kern_cq.lock, flags); 203 + 204 + return npolled; 205 + }
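notify_cq() above packs the consumer index, CQN, arm/solicited flags and command sequence number into one 64-bit doorbell value using FIELD_PREP() with the ERDMA_CQDB_* masks from erdma_hw.h. Below is a small userspace sketch of that packing, not driver code; GENMASK_ULL()/FIELD_PREP() are re-implemented as simplified stand-ins (a GCC/Clang builtin is assumed) and the CQN/CI values are arbitrary examples.

#include <stdint.h>
#include <stdio.h>

/* Simplified stand-ins for the kernel's GENMASK_ULL()/FIELD_PREP(). */
#define GENMASK_ULL(h, l) (((~0ULL) << (l)) & (~0ULL >> (63 - (h))))
#define FIELD_PREP(mask, val) \
	(((uint64_t)(val) << __builtin_ctzll(mask)) & (mask))

/* Field layout copied from the ERDMA_CQDB_* masks in erdma_hw.h. */
#define CQDB_IDX_MASK	GENMASK_ULL(63, 56)
#define CQDB_CQN_MASK	GENMASK_ULL(55, 32)
#define CQDB_ARM_MASK	(1ULL << 31)
#define CQDB_SOL_MASK	(1ULL << 30)
#define CQDB_CMDSN_MASK	GENMASK_ULL(29, 28)
#define CQDB_CI_MASK	GENMASK_ULL(23, 0)

int main(void)
{
	/* Arbitrary example: arm CQ 5 at consumer index 100, unsolicited. */
	uint64_t db = FIELD_PREP(CQDB_IDX_MASK, 1) |
		      FIELD_PREP(CQDB_CQN_MASK, 5) |
		      FIELD_PREP(CQDB_ARM_MASK, 1) |
		      FIELD_PREP(CQDB_SOL_MASK, 0) |
		      FIELD_PREP(CQDB_CMDSN_MASK, 2) |
		      FIELD_PREP(CQDB_CI_MASK, 100);

	printf("cq doorbell = 0x%016llx\n", (unsigned long long)db);
	return 0;
}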
+329
drivers/infiniband/hw/erdma/erdma_eq.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause 2 + 3 + /* Authors: Cheng Xu <chengyou@linux.alibaba.com> */ 4 + /* Kai Shen <kaishen@linux.alibaba.com> */ 5 + /* Copyright (c) 2020-2022, Alibaba Group. */ 6 + 7 + #include <linux/errno.h> 8 + #include <linux/pci.h> 9 + #include <linux/types.h> 10 + 11 + #include "erdma.h" 12 + #include "erdma_hw.h" 13 + #include "erdma_verbs.h" 14 + 15 + #define MAX_POLL_CHUNK_SIZE 16 16 + 17 + void notify_eq(struct erdma_eq *eq) 18 + { 19 + u64 db_data = FIELD_PREP(ERDMA_EQDB_CI_MASK, eq->ci) | 20 + FIELD_PREP(ERDMA_EQDB_ARM_MASK, 1); 21 + 22 + *eq->db_record = db_data; 23 + writeq(db_data, eq->db_addr); 24 + 25 + atomic64_inc(&eq->notify_num); 26 + } 27 + 28 + void *get_next_valid_eqe(struct erdma_eq *eq) 29 + { 30 + u64 *eqe = get_queue_entry(eq->qbuf, eq->ci, eq->depth, EQE_SHIFT); 31 + u32 owner = FIELD_GET(ERDMA_CEQE_HDR_O_MASK, READ_ONCE(*eqe)); 32 + 33 + return owner ^ !!(eq->ci & eq->depth) ? eqe : NULL; 34 + } 35 + 36 + void erdma_aeq_event_handler(struct erdma_dev *dev) 37 + { 38 + struct erdma_aeqe *aeqe; 39 + u32 cqn, qpn; 40 + struct erdma_qp *qp; 41 + struct erdma_cq *cq; 42 + struct ib_event event; 43 + u32 poll_cnt = 0; 44 + 45 + memset(&event, 0, sizeof(event)); 46 + 47 + while (poll_cnt < MAX_POLL_CHUNK_SIZE) { 48 + aeqe = get_next_valid_eqe(&dev->aeq); 49 + if (!aeqe) 50 + break; 51 + 52 + dma_rmb(); 53 + 54 + dev->aeq.ci++; 55 + atomic64_inc(&dev->aeq.event_num); 56 + poll_cnt++; 57 + 58 + if (FIELD_GET(ERDMA_AEQE_HDR_TYPE_MASK, 59 + le32_to_cpu(aeqe->hdr)) == ERDMA_AE_TYPE_CQ_ERR) { 60 + cqn = le32_to_cpu(aeqe->event_data0); 61 + cq = find_cq_by_cqn(dev, cqn); 62 + if (!cq) 63 + continue; 64 + 65 + event.device = cq->ibcq.device; 66 + event.element.cq = &cq->ibcq; 67 + event.event = IB_EVENT_CQ_ERR; 68 + if (cq->ibcq.event_handler) 69 + cq->ibcq.event_handler(&event, 70 + cq->ibcq.cq_context); 71 + } else { 72 + qpn = le32_to_cpu(aeqe->event_data0); 73 + qp = find_qp_by_qpn(dev, qpn); 74 + if (!qp) 75 + continue; 76 + 77 + event.device = qp->ibqp.device; 78 + event.element.qp = &qp->ibqp; 79 + event.event = IB_EVENT_QP_FATAL; 80 + if (qp->ibqp.event_handler) 81 + qp->ibqp.event_handler(&event, 82 + qp->ibqp.qp_context); 83 + } 84 + } 85 + 86 + notify_eq(&dev->aeq); 87 + } 88 + 89 + int erdma_aeq_init(struct erdma_dev *dev) 90 + { 91 + struct erdma_eq *eq = &dev->aeq; 92 + u32 buf_size; 93 + 94 + eq->depth = ERDMA_DEFAULT_EQ_DEPTH; 95 + buf_size = eq->depth << EQE_SHIFT; 96 + 97 + eq->qbuf = 98 + dma_alloc_coherent(&dev->pdev->dev, WARPPED_BUFSIZE(buf_size), 99 + &eq->qbuf_dma_addr, GFP_KERNEL | __GFP_ZERO); 100 + if (!eq->qbuf) 101 + return -ENOMEM; 102 + 103 + spin_lock_init(&eq->lock); 104 + atomic64_set(&eq->event_num, 0); 105 + atomic64_set(&eq->notify_num, 0); 106 + 107 + eq->db_addr = (u64 __iomem *)(dev->func_bar + ERDMA_REGS_AEQ_DB_REG); 108 + eq->db_record = (u64 *)(eq->qbuf + buf_size); 109 + 110 + erdma_reg_write32(dev, ERDMA_REGS_AEQ_ADDR_H_REG, 111 + upper_32_bits(eq->qbuf_dma_addr)); 112 + erdma_reg_write32(dev, ERDMA_REGS_AEQ_ADDR_L_REG, 113 + lower_32_bits(eq->qbuf_dma_addr)); 114 + erdma_reg_write32(dev, ERDMA_REGS_AEQ_DEPTH_REG, eq->depth); 115 + erdma_reg_write64(dev, ERDMA_AEQ_DB_HOST_ADDR_REG, 116 + eq->qbuf_dma_addr + buf_size); 117 + 118 + return 0; 119 + } 120 + 121 + void erdma_aeq_destroy(struct erdma_dev *dev) 122 + { 123 + struct erdma_eq *eq = &dev->aeq; 124 + 125 + dma_free_coherent(&dev->pdev->dev, 126 + WARPPED_BUFSIZE(eq->depth << EQE_SHIFT), eq->qbuf, 127 + eq->qbuf_dma_addr); 128 + } 129 
+ 130 + void erdma_ceq_completion_handler(struct erdma_eq_cb *ceq_cb) 131 + { 132 + struct erdma_dev *dev = ceq_cb->dev; 133 + struct erdma_cq *cq; 134 + u32 poll_cnt = 0; 135 + u64 *ceqe; 136 + int cqn; 137 + 138 + if (!ceq_cb->ready) 139 + return; 140 + 141 + while (poll_cnt < MAX_POLL_CHUNK_SIZE) { 142 + ceqe = get_next_valid_eqe(&ceq_cb->eq); 143 + if (!ceqe) 144 + break; 145 + 146 + dma_rmb(); 147 + ceq_cb->eq.ci++; 148 + poll_cnt++; 149 + cqn = FIELD_GET(ERDMA_CEQE_HDR_CQN_MASK, READ_ONCE(*ceqe)); 150 + 151 + cq = find_cq_by_cqn(dev, cqn); 152 + if (!cq) 153 + continue; 154 + 155 + if (rdma_is_kernel_res(&cq->ibcq.res)) 156 + cq->kern_cq.cmdsn++; 157 + 158 + if (cq->ibcq.comp_handler) 159 + cq->ibcq.comp_handler(&cq->ibcq, cq->ibcq.cq_context); 160 + } 161 + 162 + notify_eq(&ceq_cb->eq); 163 + } 164 + 165 + static irqreturn_t erdma_intr_ceq_handler(int irq, void *data) 166 + { 167 + struct erdma_eq_cb *ceq_cb = data; 168 + 169 + tasklet_schedule(&ceq_cb->tasklet); 170 + 171 + return IRQ_HANDLED; 172 + } 173 + 174 + static void erdma_intr_ceq_task(unsigned long data) 175 + { 176 + erdma_ceq_completion_handler((struct erdma_eq_cb *)data); 177 + } 178 + 179 + static int erdma_set_ceq_irq(struct erdma_dev *dev, u16 ceqn) 180 + { 181 + struct erdma_eq_cb *eqc = &dev->ceqs[ceqn]; 182 + int err; 183 + 184 + snprintf(eqc->irq.name, ERDMA_IRQNAME_SIZE, "erdma-ceq%u@pci:%s", ceqn, 185 + pci_name(dev->pdev)); 186 + eqc->irq.msix_vector = pci_irq_vector(dev->pdev, ceqn + 1); 187 + 188 + tasklet_init(&dev->ceqs[ceqn].tasklet, erdma_intr_ceq_task, 189 + (unsigned long)&dev->ceqs[ceqn]); 190 + 191 + cpumask_set_cpu(cpumask_local_spread(ceqn + 1, dev->attrs.numa_node), 192 + &eqc->irq.affinity_hint_mask); 193 + 194 + err = request_irq(eqc->irq.msix_vector, erdma_intr_ceq_handler, 0, 195 + eqc->irq.name, eqc); 196 + if (err) { 197 + dev_err(&dev->pdev->dev, "failed to request_irq(%d)\n", err); 198 + return err; 199 + } 200 + 201 + irq_set_affinity_hint(eqc->irq.msix_vector, 202 + &eqc->irq.affinity_hint_mask); 203 + 204 + return 0; 205 + } 206 + 207 + static void erdma_free_ceq_irq(struct erdma_dev *dev, u16 ceqn) 208 + { 209 + struct erdma_eq_cb *eqc = &dev->ceqs[ceqn]; 210 + 211 + irq_set_affinity_hint(eqc->irq.msix_vector, NULL); 212 + free_irq(eqc->irq.msix_vector, eqc); 213 + } 214 + 215 + static int create_eq_cmd(struct erdma_dev *dev, u32 eqn, struct erdma_eq *eq) 216 + { 217 + struct erdma_cmdq_create_eq_req req; 218 + dma_addr_t db_info_dma_addr; 219 + 220 + erdma_cmdq_build_reqhdr(&req.hdr, CMDQ_SUBMOD_COMMON, 221 + CMDQ_OPCODE_CREATE_EQ); 222 + req.eqn = eqn; 223 + req.depth = ilog2(eq->depth); 224 + req.qbuf_addr = eq->qbuf_dma_addr; 225 + req.qtype = ERDMA_EQ_TYPE_CEQ; 226 + /* Vector index is the same as EQN. 
*/ 227 + req.vector_idx = eqn; 228 + db_info_dma_addr = eq->qbuf_dma_addr + (eq->depth << EQE_SHIFT); 229 + req.db_dma_addr_l = lower_32_bits(db_info_dma_addr); 230 + req.db_dma_addr_h = upper_32_bits(db_info_dma_addr); 231 + 232 + return erdma_post_cmd_wait(&dev->cmdq, (u64 *)&req, 233 + sizeof(struct erdma_cmdq_create_eq_req), 234 + NULL, NULL); 235 + } 236 + 237 + static int erdma_ceq_init_one(struct erdma_dev *dev, u16 ceqn) 238 + { 239 + struct erdma_eq *eq = &dev->ceqs[ceqn].eq; 240 + u32 buf_size = ERDMA_DEFAULT_EQ_DEPTH << EQE_SHIFT; 241 + int ret; 242 + 243 + eq->qbuf = 244 + dma_alloc_coherent(&dev->pdev->dev, WARPPED_BUFSIZE(buf_size), 245 + &eq->qbuf_dma_addr, GFP_KERNEL | __GFP_ZERO); 246 + if (!eq->qbuf) 247 + return -ENOMEM; 248 + 249 + spin_lock_init(&eq->lock); 250 + atomic64_set(&eq->event_num, 0); 251 + atomic64_set(&eq->notify_num, 0); 252 + 253 + eq->depth = ERDMA_DEFAULT_EQ_DEPTH; 254 + eq->db_addr = 255 + (u64 __iomem *)(dev->func_bar + ERDMA_REGS_CEQ_DB_BASE_REG + 256 + (ceqn + 1) * ERDMA_DB_SIZE); 257 + eq->db_record = (u64 *)(eq->qbuf + buf_size); 258 + eq->ci = 0; 259 + dev->ceqs[ceqn].dev = dev; 260 + 261 + /* CEQ indexed from 1, 0 rsvd for CMDQ-EQ. */ 262 + ret = create_eq_cmd(dev, ceqn + 1, eq); 263 + dev->ceqs[ceqn].ready = ret ? false : true; 264 + 265 + return ret; 266 + } 267 + 268 + static void erdma_ceq_uninit_one(struct erdma_dev *dev, u16 ceqn) 269 + { 270 + struct erdma_eq *eq = &dev->ceqs[ceqn].eq; 271 + u32 buf_size = ERDMA_DEFAULT_EQ_DEPTH << EQE_SHIFT; 272 + struct erdma_cmdq_destroy_eq_req req; 273 + int err; 274 + 275 + dev->ceqs[ceqn].ready = 0; 276 + 277 + erdma_cmdq_build_reqhdr(&req.hdr, CMDQ_SUBMOD_COMMON, 278 + CMDQ_OPCODE_DESTROY_EQ); 279 + /* CEQ indexed from 1, 0 rsvd for CMDQ-EQ. */ 280 + req.eqn = ceqn + 1; 281 + req.qtype = ERDMA_EQ_TYPE_CEQ; 282 + req.vector_idx = ceqn + 1; 283 + 284 + err = erdma_post_cmd_wait(&dev->cmdq, (u64 *)&req, sizeof(req), NULL, 285 + NULL); 286 + if (err) 287 + return; 288 + 289 + dma_free_coherent(&dev->pdev->dev, WARPPED_BUFSIZE(buf_size), eq->qbuf, 290 + eq->qbuf_dma_addr); 291 + } 292 + 293 + int erdma_ceqs_init(struct erdma_dev *dev) 294 + { 295 + u32 i, j; 296 + int err; 297 + 298 + for (i = 0; i < dev->attrs.irq_num - 1; i++) { 299 + err = erdma_ceq_init_one(dev, i); 300 + if (err) 301 + goto out_err; 302 + 303 + err = erdma_set_ceq_irq(dev, i); 304 + if (err) { 305 + erdma_ceq_uninit_one(dev, i); 306 + goto out_err; 307 + } 308 + } 309 + 310 + return 0; 311 + 312 + out_err: 313 + for (j = 0; j < i; j++) { 314 + erdma_free_ceq_irq(dev, j); 315 + erdma_ceq_uninit_one(dev, j); 316 + } 317 + 318 + return err; 319 + } 320 + 321 + void erdma_ceqs_uninit(struct erdma_dev *dev) 322 + { 323 + u32 i; 324 + 325 + for (i = 0; i < dev->attrs.irq_num - 1; i++) { 326 + erdma_free_ceq_irq(dev, i); 327 + erdma_ceq_uninit_one(dev, i); 328 + } 329 + }
+508
drivers/infiniband/hw/erdma/erdma_hw.h
··· 1 + /* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */ 2 + 3 + /* Authors: Cheng Xu <chengyou@linux.alibaba.com> */ 4 + /* Kai Shen <kaishen@linux.alibaba.com> */ 5 + /* Copyright (c) 2020-2022, Alibaba Group. */ 6 + 7 + #ifndef __ERDMA_HW_H__ 8 + #define __ERDMA_HW_H__ 9 + 10 + #include <linux/kernel.h> 11 + #include <linux/types.h> 12 + 13 + /* PCIe device related definition. */ 14 + #define PCI_VENDOR_ID_ALIBABA 0x1ded 15 + 16 + #define ERDMA_PCI_WIDTH 64 17 + #define ERDMA_FUNC_BAR 0 18 + #define ERDMA_MISX_BAR 2 19 + 20 + #define ERDMA_BAR_MASK (BIT(ERDMA_FUNC_BAR) | BIT(ERDMA_MISX_BAR)) 21 + 22 + /* MSI-X related. */ 23 + #define ERDMA_NUM_MSIX_VEC 32U 24 + #define ERDMA_MSIX_VECTOR_CMDQ 0 25 + 26 + /* PCIe Bar0 Registers. */ 27 + #define ERDMA_REGS_VERSION_REG 0x0 28 + #define ERDMA_REGS_DEV_CTRL_REG 0x10 29 + #define ERDMA_REGS_DEV_ST_REG 0x14 30 + #define ERDMA_REGS_NETDEV_MAC_L_REG 0x18 31 + #define ERDMA_REGS_NETDEV_MAC_H_REG 0x1C 32 + #define ERDMA_REGS_CMDQ_SQ_ADDR_L_REG 0x20 33 + #define ERDMA_REGS_CMDQ_SQ_ADDR_H_REG 0x24 34 + #define ERDMA_REGS_CMDQ_CQ_ADDR_L_REG 0x28 35 + #define ERDMA_REGS_CMDQ_CQ_ADDR_H_REG 0x2C 36 + #define ERDMA_REGS_CMDQ_DEPTH_REG 0x30 37 + #define ERDMA_REGS_CMDQ_EQ_DEPTH_REG 0x34 38 + #define ERDMA_REGS_CMDQ_EQ_ADDR_L_REG 0x38 39 + #define ERDMA_REGS_CMDQ_EQ_ADDR_H_REG 0x3C 40 + #define ERDMA_REGS_AEQ_ADDR_L_REG 0x40 41 + #define ERDMA_REGS_AEQ_ADDR_H_REG 0x44 42 + #define ERDMA_REGS_AEQ_DEPTH_REG 0x48 43 + #define ERDMA_REGS_GRP_NUM_REG 0x4c 44 + #define ERDMA_REGS_AEQ_DB_REG 0x50 45 + #define ERDMA_CMDQ_SQ_DB_HOST_ADDR_REG 0x60 46 + #define ERDMA_CMDQ_CQ_DB_HOST_ADDR_REG 0x68 47 + #define ERDMA_CMDQ_EQ_DB_HOST_ADDR_REG 0x70 48 + #define ERDMA_AEQ_DB_HOST_ADDR_REG 0x78 49 + #define ERDMA_REGS_STATS_TSO_IN_PKTS_REG 0x80 50 + #define ERDMA_REGS_STATS_TSO_OUT_PKTS_REG 0x88 51 + #define ERDMA_REGS_STATS_TSO_OUT_BYTES_REG 0x90 52 + #define ERDMA_REGS_STATS_TX_DROP_PKTS_REG 0x98 53 + #define ERDMA_REGS_STATS_TX_BPS_METER_DROP_PKTS_REG 0xa0 54 + #define ERDMA_REGS_STATS_TX_PPS_METER_DROP_PKTS_REG 0xa8 55 + #define ERDMA_REGS_STATS_RX_PKTS_REG 0xc0 56 + #define ERDMA_REGS_STATS_RX_BYTES_REG 0xc8 57 + #define ERDMA_REGS_STATS_RX_DROP_PKTS_REG 0xd0 58 + #define ERDMA_REGS_STATS_RX_BPS_METER_DROP_PKTS_REG 0xd8 59 + #define ERDMA_REGS_STATS_RX_PPS_METER_DROP_PKTS_REG 0xe0 60 + #define ERDMA_REGS_CEQ_DB_BASE_REG 0x100 61 + #define ERDMA_CMDQ_SQDB_REG 0x200 62 + #define ERDMA_CMDQ_CQDB_REG 0x300 63 + 64 + /* DEV_CTRL_REG details. */ 65 + #define ERDMA_REG_DEV_CTRL_RESET_MASK 0x00000001 66 + #define ERDMA_REG_DEV_CTRL_INIT_MASK 0x00000002 67 + 68 + /* DEV_ST_REG details. */ 69 + #define ERDMA_REG_DEV_ST_RESET_DONE_MASK 0x00000001U 70 + #define ERDMA_REG_DEV_ST_INIT_DONE_MASK 0x00000002U 71 + 72 + /* eRDMA PCIe DBs definition. */ 73 + #define ERDMA_BAR_DB_SPACE_BASE 4096 74 + 75 + #define ERDMA_BAR_SQDB_SPACE_OFFSET ERDMA_BAR_DB_SPACE_BASE 76 + #define ERDMA_BAR_SQDB_SPACE_SIZE (384 * 1024) 77 + 78 + #define ERDMA_BAR_RQDB_SPACE_OFFSET \ 79 + (ERDMA_BAR_SQDB_SPACE_OFFSET + ERDMA_BAR_SQDB_SPACE_SIZE) 80 + #define ERDMA_BAR_RQDB_SPACE_SIZE (96 * 1024) 81 + 82 + #define ERDMA_BAR_CQDB_SPACE_OFFSET \ 83 + (ERDMA_BAR_RQDB_SPACE_OFFSET + ERDMA_BAR_RQDB_SPACE_SIZE) 84 + 85 + /* Doorbell page resources related. */ 86 + /* 87 + * Max # of parallelly issued directSQE is 3072 per device, 88 + * hardware organizes this into 24 group, per group has 128 credits. 
89 + */ 90 + #define ERDMA_DWQE_MAX_GRP_CNT 24 91 + #define ERDMA_DWQE_NUM_PER_GRP 128 92 + 93 + #define ERDMA_DWQE_TYPE0_CNT 64 94 + #define ERDMA_DWQE_TYPE1_CNT 496 95 + /* type1 DB contains 2 DBs, takes 256Byte. */ 96 + #define ERDMA_DWQE_TYPE1_CNT_PER_PAGE 16 97 + 98 + #define ERDMA_SDB_SHARED_PAGE_INDEX 95 99 + 100 + /* Doorbell related. */ 101 + #define ERDMA_DB_SIZE 8 102 + 103 + #define ERDMA_CQDB_IDX_MASK GENMASK_ULL(63, 56) 104 + #define ERDMA_CQDB_CQN_MASK GENMASK_ULL(55, 32) 105 + #define ERDMA_CQDB_ARM_MASK BIT_ULL(31) 106 + #define ERDMA_CQDB_SOL_MASK BIT_ULL(30) 107 + #define ERDMA_CQDB_CMDSN_MASK GENMASK_ULL(29, 28) 108 + #define ERDMA_CQDB_CI_MASK GENMASK_ULL(23, 0) 109 + 110 + #define ERDMA_EQDB_ARM_MASK BIT(31) 111 + #define ERDMA_EQDB_CI_MASK GENMASK_ULL(23, 0) 112 + 113 + #define ERDMA_PAGE_SIZE_SUPPORT 0x7FFFF000 114 + 115 + /* WQE related. */ 116 + #define EQE_SIZE 16 117 + #define EQE_SHIFT 4 118 + #define RQE_SIZE 32 119 + #define RQE_SHIFT 5 120 + #define CQE_SIZE 32 121 + #define CQE_SHIFT 5 122 + #define SQEBB_SIZE 32 123 + #define SQEBB_SHIFT 5 124 + #define SQEBB_MASK (~(SQEBB_SIZE - 1)) 125 + #define SQEBB_ALIGN(size) ((size + SQEBB_SIZE - 1) & SQEBB_MASK) 126 + #define SQEBB_COUNT(size) (SQEBB_ALIGN(size) >> SQEBB_SHIFT) 127 + 128 + #define ERDMA_MAX_SQE_SIZE 128 129 + #define ERDMA_MAX_WQEBB_PER_SQE 4 130 + 131 + /* CMDQ related. */ 132 + #define ERDMA_CMDQ_MAX_OUTSTANDING 128 133 + #define ERDMA_CMDQ_SQE_SIZE 64 134 + 135 + /* cmdq sub module definition. */ 136 + enum CMDQ_WQE_SUB_MOD { 137 + CMDQ_SUBMOD_RDMA = 0, 138 + CMDQ_SUBMOD_COMMON = 1 139 + }; 140 + 141 + enum CMDQ_RDMA_OPCODE { 142 + CMDQ_OPCODE_QUERY_DEVICE = 0, 143 + CMDQ_OPCODE_CREATE_QP = 1, 144 + CMDQ_OPCODE_DESTROY_QP = 2, 145 + CMDQ_OPCODE_MODIFY_QP = 3, 146 + CMDQ_OPCODE_CREATE_CQ = 4, 147 + CMDQ_OPCODE_DESTROY_CQ = 5, 148 + CMDQ_OPCODE_REG_MR = 8, 149 + CMDQ_OPCODE_DEREG_MR = 9 150 + }; 151 + 152 + enum CMDQ_COMMON_OPCODE { 153 + CMDQ_OPCODE_CREATE_EQ = 0, 154 + CMDQ_OPCODE_DESTROY_EQ = 1, 155 + CMDQ_OPCODE_QUERY_FW_INFO = 2, 156 + }; 157 + 158 + /* cmdq-SQE HDR */ 159 + #define ERDMA_CMD_HDR_WQEBB_CNT_MASK GENMASK_ULL(54, 52) 160 + #define ERDMA_CMD_HDR_CONTEXT_COOKIE_MASK GENMASK_ULL(47, 32) 161 + #define ERDMA_CMD_HDR_SUB_MOD_MASK GENMASK_ULL(25, 24) 162 + #define ERDMA_CMD_HDR_OPCODE_MASK GENMASK_ULL(23, 16) 163 + #define ERDMA_CMD_HDR_WQEBB_INDEX_MASK GENMASK_ULL(15, 0) 164 + 165 + struct erdma_cmdq_destroy_cq_req { 166 + u64 hdr; 167 + u32 cqn; 168 + }; 169 + 170 + #define ERDMA_EQ_TYPE_AEQ 0 171 + #define ERDMA_EQ_TYPE_CEQ 1 172 + 173 + struct erdma_cmdq_create_eq_req { 174 + u64 hdr; 175 + u64 qbuf_addr; 176 + u8 vector_idx; 177 + u8 eqn; 178 + u8 depth; 179 + u8 qtype; 180 + u32 db_dma_addr_l; 181 + u32 db_dma_addr_h; 182 + }; 183 + 184 + struct erdma_cmdq_destroy_eq_req { 185 + u64 hdr; 186 + u64 rsvd0; 187 + u8 vector_idx; 188 + u8 eqn; 189 + u8 rsvd1; 190 + u8 qtype; 191 + }; 192 + 193 + /* create_cq cfg0 */ 194 + #define ERDMA_CMD_CREATE_CQ_DEPTH_MASK GENMASK(31, 24) 195 + #define ERDMA_CMD_CREATE_CQ_PAGESIZE_MASK GENMASK(23, 20) 196 + #define ERDMA_CMD_CREATE_CQ_CQN_MASK GENMASK(19, 0) 197 + 198 + /* create_cq cfg1 */ 199 + #define ERDMA_CMD_CREATE_CQ_MTT_CNT_MASK GENMASK(31, 16) 200 + #define ERDMA_CMD_CREATE_CQ_MTT_TYPE_MASK BIT(15) 201 + #define ERDMA_CMD_CREATE_CQ_EQN_MASK GENMASK(9, 0) 202 + 203 + struct erdma_cmdq_create_cq_req { 204 + u64 hdr; 205 + u32 cfg0; 206 + u32 qbuf_addr_l; 207 + u32 qbuf_addr_h; 208 + u32 cfg1; 209 + u64 cq_db_info_addr; 210 + u32 
first_page_offset; 211 + }; 212 + 213 + /* regmr/deregmr cfg0 */ 214 + #define ERDMA_CMD_MR_VALID_MASK BIT(31) 215 + #define ERDMA_CMD_MR_KEY_MASK GENMASK(27, 20) 216 + #define ERDMA_CMD_MR_MPT_IDX_MASK GENMASK(19, 0) 217 + 218 + /* regmr cfg1 */ 219 + #define ERDMA_CMD_REGMR_PD_MASK GENMASK(31, 12) 220 + #define ERDMA_CMD_REGMR_TYPE_MASK GENMASK(7, 6) 221 + #define ERDMA_CMD_REGMR_RIGHT_MASK GENMASK(5, 2) 222 + #define ERDMA_CMD_REGMR_ACC_MODE_MASK GENMASK(1, 0) 223 + 224 + /* regmr cfg2 */ 225 + #define ERDMA_CMD_REGMR_PAGESIZE_MASK GENMASK(31, 27) 226 + #define ERDMA_CMD_REGMR_MTT_TYPE_MASK GENMASK(21, 20) 227 + #define ERDMA_CMD_REGMR_MTT_CNT_MASK GENMASK(19, 0) 228 + 229 + struct erdma_cmdq_reg_mr_req { 230 + u64 hdr; 231 + u32 cfg0; 232 + u32 cfg1; 233 + u64 start_va; 234 + u32 size; 235 + u32 cfg2; 236 + u64 phy_addr[4]; 237 + }; 238 + 239 + struct erdma_cmdq_dereg_mr_req { 240 + u64 hdr; 241 + u32 cfg; 242 + }; 243 + 244 + /* modify qp cfg */ 245 + #define ERDMA_CMD_MODIFY_QP_STATE_MASK GENMASK(31, 24) 246 + #define ERDMA_CMD_MODIFY_QP_CC_MASK GENMASK(23, 20) 247 + #define ERDMA_CMD_MODIFY_QP_QPN_MASK GENMASK(19, 0) 248 + 249 + struct erdma_cmdq_modify_qp_req { 250 + u64 hdr; 251 + u32 cfg; 252 + u32 cookie; 253 + __be32 dip; 254 + __be32 sip; 255 + __be16 sport; 256 + __be16 dport; 257 + u32 send_nxt; 258 + u32 recv_nxt; 259 + }; 260 + 261 + /* create qp cfg0 */ 262 + #define ERDMA_CMD_CREATE_QP_SQ_DEPTH_MASK GENMASK(31, 20) 263 + #define ERDMA_CMD_CREATE_QP_QPN_MASK GENMASK(19, 0) 264 + 265 + /* create qp cfg1 */ 266 + #define ERDMA_CMD_CREATE_QP_RQ_DEPTH_MASK GENMASK(31, 20) 267 + #define ERDMA_CMD_CREATE_QP_PD_MASK GENMASK(19, 0) 268 + 269 + /* create qp cqn_mtt_cfg */ 270 + #define ERDMA_CMD_CREATE_QP_PAGE_SIZE_MASK GENMASK(31, 28) 271 + #define ERDMA_CMD_CREATE_QP_CQN_MASK GENMASK(23, 0) 272 + 273 + /* create qp mtt_cfg */ 274 + #define ERDMA_CMD_CREATE_QP_PAGE_OFFSET_MASK GENMASK(31, 12) 275 + #define ERDMA_CMD_CREATE_QP_MTT_CNT_MASK GENMASK(11, 1) 276 + #define ERDMA_CMD_CREATE_QP_MTT_TYPE_MASK BIT(0) 277 + 278 + #define ERDMA_CMDQ_CREATE_QP_RESP_COOKIE_MASK GENMASK_ULL(31, 0) 279 + 280 + struct erdma_cmdq_create_qp_req { 281 + u64 hdr; 282 + u32 cfg0; 283 + u32 cfg1; 284 + u32 sq_cqn_mtt_cfg; 285 + u32 rq_cqn_mtt_cfg; 286 + u64 sq_buf_addr; 287 + u64 rq_buf_addr; 288 + u32 sq_mtt_cfg; 289 + u32 rq_mtt_cfg; 290 + u64 sq_db_info_dma_addr; 291 + u64 rq_db_info_dma_addr; 292 + }; 293 + 294 + struct erdma_cmdq_destroy_qp_req { 295 + u64 hdr; 296 + u32 qpn; 297 + }; 298 + 299 + /* cap qword 0 definition */ 300 + #define ERDMA_CMD_DEV_CAP_MAX_CQE_MASK GENMASK_ULL(47, 40) 301 + #define ERDMA_CMD_DEV_CAP_MAX_RECV_WR_MASK GENMASK_ULL(23, 16) 302 + #define ERDMA_CMD_DEV_CAP_MAX_MR_SIZE_MASK GENMASK_ULL(7, 0) 303 + 304 + /* cap qword 1 definition */ 305 + #define ERDMA_CMD_DEV_CAP_DMA_LOCAL_KEY_MASK GENMASK_ULL(63, 32) 306 + #define ERDMA_CMD_DEV_CAP_DEFAULT_CC_MASK GENMASK_ULL(31, 28) 307 + #define ERDMA_CMD_DEV_CAP_QBLOCK_MASK GENMASK_ULL(27, 16) 308 + #define ERDMA_CMD_DEV_CAP_MAX_MW_MASK GENMASK_ULL(7, 0) 309 + 310 + #define ERDMA_NQP_PER_QBLOCK 1024 311 + 312 + #define ERDMA_CMD_INFO0_FW_VER_MASK GENMASK_ULL(31, 0) 313 + 314 + /* CQE hdr */ 315 + #define ERDMA_CQE_HDR_OWNER_MASK BIT(31) 316 + #define ERDMA_CQE_HDR_OPCODE_MASK GENMASK(23, 16) 317 + #define ERDMA_CQE_HDR_QTYPE_MASK GENMASK(15, 8) 318 + #define ERDMA_CQE_HDR_SYNDROME_MASK GENMASK(7, 0) 319 + 320 + #define ERDMA_CQE_QTYPE_SQ 0 321 + #define ERDMA_CQE_QTYPE_RQ 1 322 + #define ERDMA_CQE_QTYPE_CMDQ 2 323 + 324 + struct 
erdma_cqe { 325 + __be32 hdr; 326 + __be32 qe_idx; 327 + __be32 qpn; 328 + union { 329 + __le32 imm_data; 330 + __be32 inv_rkey; 331 + }; 332 + __be32 size; 333 + __be32 rsvd[3]; 334 + }; 335 + 336 + struct erdma_sge { 337 + __aligned_le64 laddr; 338 + __le32 length; 339 + __le32 lkey; 340 + }; 341 + 342 + /* Receive Queue Element */ 343 + struct erdma_rqe { 344 + __le16 qe_idx; 345 + __le16 rsvd0; 346 + __le32 qpn; 347 + __le32 rsvd1; 348 + __le32 rsvd2; 349 + __le64 to; 350 + __le32 length; 351 + __le32 stag; 352 + }; 353 + 354 + /* SQE */ 355 + #define ERDMA_SQE_HDR_SGL_LEN_MASK GENMASK_ULL(63, 56) 356 + #define ERDMA_SQE_HDR_WQEBB_CNT_MASK GENMASK_ULL(54, 52) 357 + #define ERDMA_SQE_HDR_QPN_MASK GENMASK_ULL(51, 32) 358 + #define ERDMA_SQE_HDR_OPCODE_MASK GENMASK_ULL(31, 27) 359 + #define ERDMA_SQE_HDR_DWQE_MASK BIT_ULL(26) 360 + #define ERDMA_SQE_HDR_INLINE_MASK BIT_ULL(25) 361 + #define ERDMA_SQE_HDR_FENCE_MASK BIT_ULL(24) 362 + #define ERDMA_SQE_HDR_SE_MASK BIT_ULL(23) 363 + #define ERDMA_SQE_HDR_CE_MASK BIT_ULL(22) 364 + #define ERDMA_SQE_HDR_WQEBB_INDEX_MASK GENMASK_ULL(15, 0) 365 + 366 + /* REG MR attrs */ 367 + #define ERDMA_SQE_MR_MODE_MASK GENMASK(1, 0) 368 + #define ERDMA_SQE_MR_ACCESS_MASK GENMASK(5, 2) 369 + #define ERDMA_SQE_MR_MTT_TYPE_MASK GENMASK(7, 6) 370 + #define ERDMA_SQE_MR_MTT_CNT_MASK GENMASK(31, 12) 371 + 372 + struct erdma_write_sqe { 373 + __le64 hdr; 374 + __be32 imm_data; 375 + __le32 length; 376 + 377 + __le32 sink_stag; 378 + __le32 sink_to_l; 379 + __le32 sink_to_h; 380 + 381 + __le32 rsvd; 382 + 383 + struct erdma_sge sgl[0]; 384 + }; 385 + 386 + struct erdma_send_sqe { 387 + __le64 hdr; 388 + union { 389 + __be32 imm_data; 390 + __le32 invalid_stag; 391 + }; 392 + 393 + __le32 length; 394 + struct erdma_sge sgl[0]; 395 + }; 396 + 397 + struct erdma_readreq_sqe { 398 + __le64 hdr; 399 + __le32 invalid_stag; 400 + __le32 length; 401 + __le32 sink_stag; 402 + __le32 sink_to_l; 403 + __le32 sink_to_h; 404 + __le32 rsvd; 405 + }; 406 + 407 + struct erdma_reg_mr_sqe { 408 + __le64 hdr; 409 + __le64 addr; 410 + __le32 length; 411 + __le32 stag; 412 + __le32 attrs; 413 + __le32 rsvd; 414 + }; 415 + 416 + /* EQ related. 
*/ 417 + #define ERDMA_DEFAULT_EQ_DEPTH 256 418 + 419 + /* ceqe */ 420 + #define ERDMA_CEQE_HDR_DB_MASK BIT_ULL(63) 421 + #define ERDMA_CEQE_HDR_PI_MASK GENMASK_ULL(55, 32) 422 + #define ERDMA_CEQE_HDR_O_MASK BIT_ULL(31) 423 + #define ERDMA_CEQE_HDR_CQN_MASK GENMASK_ULL(19, 0) 424 + 425 + /* aeqe */ 426 + #define ERDMA_AEQE_HDR_O_MASK BIT(31) 427 + #define ERDMA_AEQE_HDR_TYPE_MASK GENMASK(23, 16) 428 + #define ERDMA_AEQE_HDR_SUBTYPE_MASK GENMASK(7, 0) 429 + 430 + #define ERDMA_AE_TYPE_QP_FATAL_EVENT 0 431 + #define ERDMA_AE_TYPE_QP_ERQ_ERR_EVENT 1 432 + #define ERDMA_AE_TYPE_ACC_ERR_EVENT 2 433 + #define ERDMA_AE_TYPE_CQ_ERR 3 434 + #define ERDMA_AE_TYPE_OTHER_ERROR 4 435 + 436 + struct erdma_aeqe { 437 + __le32 hdr; 438 + __le32 event_data0; 439 + __le32 event_data1; 440 + __le32 rsvd; 441 + }; 442 + 443 + enum erdma_opcode { 444 + ERDMA_OP_WRITE = 0, 445 + ERDMA_OP_READ = 1, 446 + ERDMA_OP_SEND = 2, 447 + ERDMA_OP_SEND_WITH_IMM = 3, 448 + 449 + ERDMA_OP_RECEIVE = 4, 450 + ERDMA_OP_RECV_IMM = 5, 451 + ERDMA_OP_RECV_INV = 6, 452 + 453 + ERDMA_OP_REQ_ERR = 7, 454 + ERDMA_OP_READ_RESPONSE = 8, 455 + ERDMA_OP_WRITE_WITH_IMM = 9, 456 + 457 + ERDMA_OP_RECV_ERR = 10, 458 + 459 + ERDMA_OP_INVALIDATE = 11, 460 + ERDMA_OP_RSP_SEND_IMM = 12, 461 + ERDMA_OP_SEND_WITH_INV = 13, 462 + 463 + ERDMA_OP_REG_MR = 14, 464 + ERDMA_OP_LOCAL_INV = 15, 465 + ERDMA_OP_READ_WITH_INV = 16, 466 + ERDMA_NUM_OPCODES = 17, 467 + ERDMA_OP_INVALID = ERDMA_NUM_OPCODES + 1 468 + }; 469 + 470 + enum erdma_wc_status { 471 + ERDMA_WC_SUCCESS = 0, 472 + ERDMA_WC_GENERAL_ERR = 1, 473 + ERDMA_WC_RECV_WQE_FORMAT_ERR = 2, 474 + ERDMA_WC_RECV_STAG_INVALID_ERR = 3, 475 + ERDMA_WC_RECV_ADDR_VIOLATION_ERR = 4, 476 + ERDMA_WC_RECV_RIGHT_VIOLATION_ERR = 5, 477 + ERDMA_WC_RECV_PDID_ERR = 6, 478 + ERDMA_WC_RECV_WARRPING_ERR = 7, 479 + ERDMA_WC_SEND_WQE_FORMAT_ERR = 8, 480 + ERDMA_WC_SEND_WQE_ORD_EXCEED = 9, 481 + ERDMA_WC_SEND_STAG_INVALID_ERR = 10, 482 + ERDMA_WC_SEND_ADDR_VIOLATION_ERR = 11, 483 + ERDMA_WC_SEND_RIGHT_VIOLATION_ERR = 12, 484 + ERDMA_WC_SEND_PDID_ERR = 13, 485 + ERDMA_WC_SEND_WARRPING_ERR = 14, 486 + ERDMA_WC_FLUSH_ERR = 15, 487 + ERDMA_WC_RETRY_EXC_ERR = 16, 488 + ERDMA_NUM_WC_STATUS 489 + }; 490 + 491 + enum erdma_vendor_err { 492 + ERDMA_WC_VENDOR_NO_ERR = 0, 493 + ERDMA_WC_VENDOR_INVALID_RQE = 1, 494 + ERDMA_WC_VENDOR_RQE_INVALID_STAG = 2, 495 + ERDMA_WC_VENDOR_RQE_ADDR_VIOLATION = 3, 496 + ERDMA_WC_VENDOR_RQE_ACCESS_RIGHT_ERR = 4, 497 + ERDMA_WC_VENDOR_RQE_INVALID_PD = 5, 498 + ERDMA_WC_VENDOR_RQE_WRAP_ERR = 6, 499 + ERDMA_WC_VENDOR_INVALID_SQE = 0x20, 500 + ERDMA_WC_VENDOR_ZERO_ORD = 0x21, 501 + ERDMA_WC_VENDOR_SQE_INVALID_STAG = 0x30, 502 + ERDMA_WC_VENDOR_SQE_ADDR_VIOLATION = 0x31, 503 + ERDMA_WC_VENDOR_SQE_ACCESS_ERR = 0x32, 504 + ERDMA_WC_VENDOR_SQE_INVALID_PD = 0x33, 505 + ERDMA_WC_VENDOR_SQE_WARP_ERR = 0x34 506 + }; 507 + 508 + #endif
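The SQEBB_* macros in erdma_hw.h round a WQE size up to 32-byte building blocks. A quick standalone check of that arithmetic (plain userspace C, not driver code) shows that ERDMA_CMDQ_SQE_SIZE (64) occupies two WQEBBs and ERDMA_MAX_SQE_SIZE (128) occupies ERDMA_MAX_WQEBB_PER_SQE (4).

#include <assert.h>
#include <stdio.h>

/* Mirrors the SQEBB_* macros from erdma_hw.h (32-byte building blocks). */
#define SQEBB_SIZE 32
#define SQEBB_SHIFT 5
#define SQEBB_MASK (~(SQEBB_SIZE - 1))
#define SQEBB_ALIGN(size) (((size) + SQEBB_SIZE - 1) & SQEBB_MASK)
#define SQEBB_COUNT(size) (SQEBB_ALIGN(size) >> SQEBB_SHIFT)

int main(void)
{
	assert(SQEBB_COUNT(64) == 2);	/* ERDMA_CMDQ_SQE_SIZE -> 2 WQEBBs */
	assert(SQEBB_COUNT(128) == 4);	/* ERDMA_MAX_SQE_SIZE -> ERDMA_MAX_WQEBB_PER_SQE */
	assert(SQEBB_COUNT(1) == 1);	/* any non-empty WQE rounds up to one block */
	printf("SQEBB_COUNT checks passed\n");
	return 0;
}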
+608
drivers/infiniband/hw/erdma/erdma_main.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause 2 + 3 + /* Authors: Cheng Xu <chengyou@linux.alibaba.com> */ 4 + /* Kai Shen <kaishen@linux.alibaba.com> */ 5 + /* Copyright (c) 2020-2022, Alibaba Group. */ 6 + 7 + #include <linux/errno.h> 8 + #include <linux/init.h> 9 + #include <linux/kernel.h> 10 + #include <linux/list.h> 11 + #include <linux/module.h> 12 + #include <linux/netdevice.h> 13 + #include <linux/pci.h> 14 + #include <net/addrconf.h> 15 + #include <rdma/erdma-abi.h> 16 + #include <rdma/ib_verbs.h> 17 + #include <rdma/ib_user_verbs.h> 18 + 19 + #include "erdma.h" 20 + #include "erdma_cm.h" 21 + #include "erdma_hw.h" 22 + #include "erdma_verbs.h" 23 + 24 + MODULE_AUTHOR("Cheng Xu <chengyou@linux.alibaba.com>"); 25 + MODULE_DESCRIPTION("Alibaba elasticRDMA adapter driver"); 26 + MODULE_LICENSE("Dual BSD/GPL"); 27 + 28 + static int erdma_netdev_event(struct notifier_block *nb, unsigned long event, 29 + void *arg) 30 + { 31 + struct net_device *netdev = netdev_notifier_info_to_dev(arg); 32 + struct erdma_dev *dev = container_of(nb, struct erdma_dev, netdev_nb); 33 + 34 + if (dev->netdev == NULL || dev->netdev != netdev) 35 + goto done; 36 + 37 + switch (event) { 38 + case NETDEV_UP: 39 + dev->state = IB_PORT_ACTIVE; 40 + erdma_port_event(dev, IB_EVENT_PORT_ACTIVE); 41 + break; 42 + case NETDEV_DOWN: 43 + dev->state = IB_PORT_DOWN; 44 + erdma_port_event(dev, IB_EVENT_PORT_ERR); 45 + break; 46 + case NETDEV_REGISTER: 47 + case NETDEV_UNREGISTER: 48 + case NETDEV_CHANGEADDR: 49 + case NETDEV_CHANGEMTU: 50 + case NETDEV_GOING_DOWN: 51 + case NETDEV_CHANGE: 52 + default: 53 + break; 54 + } 55 + 56 + done: 57 + return NOTIFY_OK; 58 + } 59 + 60 + static int erdma_enum_and_get_netdev(struct erdma_dev *dev) 61 + { 62 + struct net_device *netdev; 63 + int ret = -ENODEV; 64 + 65 + /* Already binded to a net_device, so we skip. */ 66 + if (dev->netdev) 67 + return 0; 68 + 69 + rtnl_lock(); 70 + for_each_netdev(&init_net, netdev) { 71 + /* 72 + * In erdma, the paired netdev and ibdev should have the same 73 + * MAC address. erdma can get the value from its PCIe bar 74 + * registers. Since erdma can not get the paired netdev 75 + * reference directly, we do a traverse here to get the paired 76 + * netdev. 
77 + */ 78 + if (ether_addr_equal_unaligned(netdev->perm_addr, 79 + dev->attrs.peer_addr)) { 80 + ret = ib_device_set_netdev(&dev->ibdev, netdev, 1); 81 + if (ret) { 82 + rtnl_unlock(); 83 + ibdev_warn(&dev->ibdev, 84 + "failed (%d) to link netdev", ret); 85 + return ret; 86 + } 87 + 88 + dev->netdev = netdev; 89 + break; 90 + } 91 + } 92 + 93 + rtnl_unlock(); 94 + 95 + return ret; 96 + } 97 + 98 + static int erdma_device_register(struct erdma_dev *dev) 99 + { 100 + struct ib_device *ibdev = &dev->ibdev; 101 + int ret; 102 + 103 + ret = erdma_enum_and_get_netdev(dev); 104 + if (ret) 105 + return ret; 106 + 107 + addrconf_addr_eui48((u8 *)&ibdev->node_guid, dev->netdev->dev_addr); 108 + 109 + ret = ib_register_device(ibdev, "erdma_%d", &dev->pdev->dev); 110 + if (ret) { 111 + dev_err(&dev->pdev->dev, 112 + "ib_register_device failed: ret = %d\n", ret); 113 + return ret; 114 + } 115 + 116 + dev->netdev_nb.notifier_call = erdma_netdev_event; 117 + ret = register_netdevice_notifier(&dev->netdev_nb); 118 + if (ret) { 119 + ibdev_err(&dev->ibdev, "failed to register notifier.\n"); 120 + ib_unregister_device(ibdev); 121 + } 122 + 123 + return ret; 124 + } 125 + 126 + static irqreturn_t erdma_comm_irq_handler(int irq, void *data) 127 + { 128 + struct erdma_dev *dev = data; 129 + 130 + erdma_cmdq_completion_handler(&dev->cmdq); 131 + erdma_aeq_event_handler(dev); 132 + 133 + return IRQ_HANDLED; 134 + } 135 + 136 + static void erdma_dwqe_resource_init(struct erdma_dev *dev) 137 + { 138 + int total_pages, type0, type1; 139 + 140 + dev->attrs.grp_num = erdma_reg_read32(dev, ERDMA_REGS_GRP_NUM_REG); 141 + 142 + if (dev->attrs.grp_num < 4) 143 + dev->attrs.disable_dwqe = true; 144 + else 145 + dev->attrs.disable_dwqe = false; 146 + 147 + /* One page contains 4 goups. 
*/ 148 + total_pages = dev->attrs.grp_num * 4; 149 + 150 + if (dev->attrs.grp_num >= ERDMA_DWQE_MAX_GRP_CNT) { 151 + dev->attrs.grp_num = ERDMA_DWQE_MAX_GRP_CNT; 152 + type0 = ERDMA_DWQE_TYPE0_CNT; 153 + type1 = ERDMA_DWQE_TYPE1_CNT / ERDMA_DWQE_TYPE1_CNT_PER_PAGE; 154 + } else { 155 + type1 = total_pages / 3; 156 + type0 = total_pages - type1 - 1; 157 + } 158 + 159 + dev->attrs.dwqe_pages = type0; 160 + dev->attrs.dwqe_entries = type1 * ERDMA_DWQE_TYPE1_CNT_PER_PAGE; 161 + } 162 + 163 + static int erdma_request_vectors(struct erdma_dev *dev) 164 + { 165 + int expect_irq_num = min(num_possible_cpus() + 1, ERDMA_NUM_MSIX_VEC); 166 + int ret; 167 + 168 + ret = pci_alloc_irq_vectors(dev->pdev, 1, expect_irq_num, PCI_IRQ_MSIX); 169 + if (ret < 0) { 170 + dev_err(&dev->pdev->dev, "request irq vectors failed(%d)\n", 171 + ret); 172 + return ret; 173 + } 174 + dev->attrs.irq_num = ret; 175 + 176 + return 0; 177 + } 178 + 179 + static int erdma_comm_irq_init(struct erdma_dev *dev) 180 + { 181 + snprintf(dev->comm_irq.name, ERDMA_IRQNAME_SIZE, "erdma-common@pci:%s", 182 + pci_name(dev->pdev)); 183 + dev->comm_irq.msix_vector = 184 + pci_irq_vector(dev->pdev, ERDMA_MSIX_VECTOR_CMDQ); 185 + 186 + cpumask_set_cpu(cpumask_first(cpumask_of_pcibus(dev->pdev->bus)), 187 + &dev->comm_irq.affinity_hint_mask); 188 + irq_set_affinity_hint(dev->comm_irq.msix_vector, 189 + &dev->comm_irq.affinity_hint_mask); 190 + 191 + return request_irq(dev->comm_irq.msix_vector, erdma_comm_irq_handler, 0, 192 + dev->comm_irq.name, dev); 193 + } 194 + 195 + static void erdma_comm_irq_uninit(struct erdma_dev *dev) 196 + { 197 + irq_set_affinity_hint(dev->comm_irq.msix_vector, NULL); 198 + free_irq(dev->comm_irq.msix_vector, dev); 199 + } 200 + 201 + static int erdma_device_init(struct erdma_dev *dev, struct pci_dev *pdev) 202 + { 203 + int ret; 204 + 205 + erdma_dwqe_resource_init(dev); 206 + 207 + ret = dma_set_mask_and_coherent(&pdev->dev, 208 + DMA_BIT_MASK(ERDMA_PCI_WIDTH)); 209 + if (ret) 210 + return ret; 211 + 212 + dma_set_max_seg_size(&pdev->dev, UINT_MAX); 213 + 214 + return 0; 215 + } 216 + 217 + static void erdma_device_uninit(struct erdma_dev *dev) 218 + { 219 + u32 ctrl = FIELD_PREP(ERDMA_REG_DEV_CTRL_RESET_MASK, 1); 220 + 221 + erdma_reg_write32(dev, ERDMA_REGS_DEV_CTRL_REG, ctrl); 222 + } 223 + 224 + static const struct pci_device_id erdma_pci_tbl[] = { 225 + { PCI_DEVICE(PCI_VENDOR_ID_ALIBABA, 0x107f) }, 226 + {} 227 + }; 228 + 229 + static int erdma_probe_dev(struct pci_dev *pdev) 230 + { 231 + struct erdma_dev *dev; 232 + int bars, err; 233 + u32 version; 234 + 235 + err = pci_enable_device(pdev); 236 + if (err) { 237 + dev_err(&pdev->dev, "pci_enable_device failed(%d)\n", err); 238 + return err; 239 + } 240 + 241 + pci_set_master(pdev); 242 + 243 + dev = ib_alloc_device(erdma_dev, ibdev); 244 + if (!dev) { 245 + dev_err(&pdev->dev, "ib_alloc_device failed\n"); 246 + err = -ENOMEM; 247 + goto err_disable_device; 248 + } 249 + 250 + pci_set_drvdata(pdev, dev); 251 + dev->pdev = pdev; 252 + dev->attrs.numa_node = dev_to_node(&pdev->dev); 253 + 254 + bars = pci_select_bars(pdev, IORESOURCE_MEM); 255 + err = pci_request_selected_regions(pdev, bars, DRV_MODULE_NAME); 256 + if (bars != ERDMA_BAR_MASK || err) { 257 + err = err ? 
err : -EINVAL; 258 + goto err_ib_device_release; 259 + } 260 + 261 + dev->func_bar_addr = pci_resource_start(pdev, ERDMA_FUNC_BAR); 262 + dev->func_bar_len = pci_resource_len(pdev, ERDMA_FUNC_BAR); 263 + 264 + dev->func_bar = 265 + devm_ioremap(&pdev->dev, dev->func_bar_addr, dev->func_bar_len); 266 + if (!dev->func_bar) { 267 + dev_err(&pdev->dev, "devm_ioremap failed.\n"); 268 + err = -EFAULT; 269 + goto err_release_bars; 270 + } 271 + 272 + version = erdma_reg_read32(dev, ERDMA_REGS_VERSION_REG); 273 + if (version == 0) { 274 + /* we know that it is a non-functional function. */ 275 + err = -ENODEV; 276 + goto err_iounmap_func_bar; 277 + } 278 + 279 + err = erdma_device_init(dev, pdev); 280 + if (err) 281 + goto err_iounmap_func_bar; 282 + 283 + err = erdma_request_vectors(dev); 284 + if (err) 285 + goto err_iounmap_func_bar; 286 + 287 + err = erdma_comm_irq_init(dev); 288 + if (err) 289 + goto err_free_vectors; 290 + 291 + err = erdma_aeq_init(dev); 292 + if (err) 293 + goto err_uninit_comm_irq; 294 + 295 + err = erdma_cmdq_init(dev); 296 + if (err) 297 + goto err_uninit_aeq; 298 + 299 + err = erdma_ceqs_init(dev); 300 + if (err) 301 + goto err_uninit_cmdq; 302 + 303 + erdma_finish_cmdq_init(dev); 304 + 305 + return 0; 306 + 307 + err_uninit_cmdq: 308 + erdma_device_uninit(dev); 309 + erdma_cmdq_destroy(dev); 310 + 311 + err_uninit_aeq: 312 + erdma_aeq_destroy(dev); 313 + 314 + err_uninit_comm_irq: 315 + erdma_comm_irq_uninit(dev); 316 + 317 + err_free_vectors: 318 + pci_free_irq_vectors(dev->pdev); 319 + 320 + err_iounmap_func_bar: 321 + devm_iounmap(&pdev->dev, dev->func_bar); 322 + 323 + err_release_bars: 324 + pci_release_selected_regions(pdev, bars); 325 + 326 + err_ib_device_release: 327 + ib_dealloc_device(&dev->ibdev); 328 + 329 + err_disable_device: 330 + pci_disable_device(pdev); 331 + 332 + return err; 333 + } 334 + 335 + static void erdma_remove_dev(struct pci_dev *pdev) 336 + { 337 + struct erdma_dev *dev = pci_get_drvdata(pdev); 338 + 339 + erdma_ceqs_uninit(dev); 340 + 341 + erdma_device_uninit(dev); 342 + 343 + erdma_cmdq_destroy(dev); 344 + erdma_aeq_destroy(dev); 345 + erdma_comm_irq_uninit(dev); 346 + pci_free_irq_vectors(dev->pdev); 347 + 348 + devm_iounmap(&pdev->dev, dev->func_bar); 349 + pci_release_selected_regions(pdev, ERDMA_BAR_MASK); 350 + 351 + ib_dealloc_device(&dev->ibdev); 352 + 353 + pci_disable_device(pdev); 354 + } 355 + 356 + #define ERDMA_GET_CAP(name, cap) FIELD_GET(ERDMA_CMD_DEV_CAP_##name##_MASK, cap) 357 + 358 + static int erdma_dev_attrs_init(struct erdma_dev *dev) 359 + { 360 + int err; 361 + u64 req_hdr, cap0, cap1; 362 + 363 + erdma_cmdq_build_reqhdr(&req_hdr, CMDQ_SUBMOD_RDMA, 364 + CMDQ_OPCODE_QUERY_DEVICE); 365 + 366 + err = erdma_post_cmd_wait(&dev->cmdq, &req_hdr, sizeof(req_hdr), &cap0, 367 + &cap1); 368 + if (err) 369 + return err; 370 + 371 + dev->attrs.max_cqe = 1 << ERDMA_GET_CAP(MAX_CQE, cap0); 372 + dev->attrs.max_mr_size = 1ULL << ERDMA_GET_CAP(MAX_MR_SIZE, cap0); 373 + dev->attrs.max_mw = 1 << ERDMA_GET_CAP(MAX_MW, cap1); 374 + dev->attrs.max_recv_wr = 1 << ERDMA_GET_CAP(MAX_RECV_WR, cap0); 375 + dev->attrs.local_dma_key = ERDMA_GET_CAP(DMA_LOCAL_KEY, cap1); 376 + dev->attrs.cc = ERDMA_GET_CAP(DEFAULT_CC, cap1); 377 + dev->attrs.max_qp = ERDMA_NQP_PER_QBLOCK * ERDMA_GET_CAP(QBLOCK, cap1); 378 + dev->attrs.max_mr = dev->attrs.max_qp << 1; 379 + dev->attrs.max_cq = dev->attrs.max_qp << 1; 380 + 381 + dev->attrs.max_send_wr = ERDMA_MAX_SEND_WR; 382 + dev->attrs.max_ord = ERDMA_MAX_ORD; 383 + dev->attrs.max_ird = ERDMA_MAX_IRD; 
384 + dev->attrs.max_send_sge = ERDMA_MAX_SEND_SGE; 385 + dev->attrs.max_recv_sge = ERDMA_MAX_RECV_SGE; 386 + dev->attrs.max_sge_rd = ERDMA_MAX_SGE_RD; 387 + dev->attrs.max_pd = ERDMA_MAX_PD; 388 + 389 + dev->res_cb[ERDMA_RES_TYPE_PD].max_cap = ERDMA_MAX_PD; 390 + dev->res_cb[ERDMA_RES_TYPE_STAG_IDX].max_cap = dev->attrs.max_mr; 391 + 392 + erdma_cmdq_build_reqhdr(&req_hdr, CMDQ_SUBMOD_COMMON, 393 + CMDQ_OPCODE_QUERY_FW_INFO); 394 + 395 + err = erdma_post_cmd_wait(&dev->cmdq, &req_hdr, sizeof(req_hdr), &cap0, 396 + &cap1); 397 + if (!err) 398 + dev->attrs.fw_version = 399 + FIELD_GET(ERDMA_CMD_INFO0_FW_VER_MASK, cap0); 400 + 401 + return err; 402 + } 403 + 404 + static int erdma_res_cb_init(struct erdma_dev *dev) 405 + { 406 + int i, j; 407 + 408 + for (i = 0; i < ERDMA_RES_CNT; i++) { 409 + dev->res_cb[i].next_alloc_idx = 1; 410 + spin_lock_init(&dev->res_cb[i].lock); 411 + dev->res_cb[i].bitmap = 412 + bitmap_zalloc(dev->res_cb[i].max_cap, GFP_KERNEL); 413 + if (!dev->res_cb[i].bitmap) 414 + goto err; 415 + } 416 + 417 + return 0; 418 + 419 + err: 420 + for (j = 0; j < i; j++) 421 + bitmap_free(dev->res_cb[j].bitmap); 422 + 423 + return -ENOMEM; 424 + } 425 + 426 + static void erdma_res_cb_free(struct erdma_dev *dev) 427 + { 428 + int i; 429 + 430 + for (i = 0; i < ERDMA_RES_CNT; i++) 431 + bitmap_free(dev->res_cb[i].bitmap); 432 + } 433 + 434 + static const struct ib_device_ops erdma_device_ops = { 435 + .owner = THIS_MODULE, 436 + .driver_id = RDMA_DRIVER_ERDMA, 437 + .uverbs_abi_ver = ERDMA_ABI_VERSION, 438 + 439 + .alloc_mr = erdma_ib_alloc_mr, 440 + .alloc_pd = erdma_alloc_pd, 441 + .alloc_ucontext = erdma_alloc_ucontext, 442 + .create_cq = erdma_create_cq, 443 + .create_qp = erdma_create_qp, 444 + .dealloc_pd = erdma_dealloc_pd, 445 + .dealloc_ucontext = erdma_dealloc_ucontext, 446 + .dereg_mr = erdma_dereg_mr, 447 + .destroy_cq = erdma_destroy_cq, 448 + .destroy_qp = erdma_destroy_qp, 449 + .get_dma_mr = erdma_get_dma_mr, 450 + .get_port_immutable = erdma_get_port_immutable, 451 + .iw_accept = erdma_accept, 452 + .iw_add_ref = erdma_qp_get_ref, 453 + .iw_connect = erdma_connect, 454 + .iw_create_listen = erdma_create_listen, 455 + .iw_destroy_listen = erdma_destroy_listen, 456 + .iw_get_qp = erdma_get_ibqp, 457 + .iw_reject = erdma_reject, 458 + .iw_rem_ref = erdma_qp_put_ref, 459 + .map_mr_sg = erdma_map_mr_sg, 460 + .mmap = erdma_mmap, 461 + .mmap_free = erdma_mmap_free, 462 + .modify_qp = erdma_modify_qp, 463 + .post_recv = erdma_post_recv, 464 + .post_send = erdma_post_send, 465 + .poll_cq = erdma_poll_cq, 466 + .query_device = erdma_query_device, 467 + .query_gid = erdma_query_gid, 468 + .query_port = erdma_query_port, 469 + .query_qp = erdma_query_qp, 470 + .req_notify_cq = erdma_req_notify_cq, 471 + .reg_user_mr = erdma_reg_user_mr, 472 + 473 + INIT_RDMA_OBJ_SIZE(ib_cq, erdma_cq, ibcq), 474 + INIT_RDMA_OBJ_SIZE(ib_pd, erdma_pd, ibpd), 475 + INIT_RDMA_OBJ_SIZE(ib_ucontext, erdma_ucontext, ibucontext), 476 + INIT_RDMA_OBJ_SIZE(ib_qp, erdma_qp, ibqp), 477 + }; 478 + 479 + static int erdma_ib_device_add(struct pci_dev *pdev) 480 + { 481 + struct erdma_dev *dev = pci_get_drvdata(pdev); 482 + struct ib_device *ibdev = &dev->ibdev; 483 + u64 mac; 484 + int ret; 485 + 486 + ret = erdma_dev_attrs_init(dev); 487 + if (ret) 488 + return ret; 489 + 490 + ibdev->node_type = RDMA_NODE_RNIC; 491 + memcpy(ibdev->node_desc, ERDMA_NODE_DESC, sizeof(ERDMA_NODE_DESC)); 492 + 493 + /* 494 + * Current model (one-to-one device association): 495 + * One ERDMA device per net_device or, 
equivalently, 496 + * per physical port. 497 + */ 498 + ibdev->phys_port_cnt = 1; 499 + ibdev->num_comp_vectors = dev->attrs.irq_num - 1; 500 + 501 + ib_set_device_ops(ibdev, &erdma_device_ops); 502 + 503 + INIT_LIST_HEAD(&dev->cep_list); 504 + 505 + spin_lock_init(&dev->lock); 506 + xa_init_flags(&dev->qp_xa, XA_FLAGS_ALLOC1); 507 + xa_init_flags(&dev->cq_xa, XA_FLAGS_ALLOC1); 508 + dev->next_alloc_cqn = 1; 509 + dev->next_alloc_qpn = 1; 510 + 511 + ret = erdma_res_cb_init(dev); 512 + if (ret) 513 + return ret; 514 + 515 + spin_lock_init(&dev->db_bitmap_lock); 516 + bitmap_zero(dev->sdb_page, ERDMA_DWQE_TYPE0_CNT); 517 + bitmap_zero(dev->sdb_entry, ERDMA_DWQE_TYPE1_CNT); 518 + 519 + atomic_set(&dev->num_ctx, 0); 520 + 521 + mac = erdma_reg_read32(dev, ERDMA_REGS_NETDEV_MAC_L_REG); 522 + mac |= (u64)erdma_reg_read32(dev, ERDMA_REGS_NETDEV_MAC_H_REG) << 32; 523 + 524 + u64_to_ether_addr(mac, dev->attrs.peer_addr); 525 + 526 + ret = erdma_device_register(dev); 527 + if (ret) 528 + goto err_out; 529 + 530 + return 0; 531 + 532 + err_out: 533 + xa_destroy(&dev->qp_xa); 534 + xa_destroy(&dev->cq_xa); 535 + 536 + erdma_res_cb_free(dev); 537 + 538 + return ret; 539 + } 540 + 541 + static void erdma_ib_device_remove(struct pci_dev *pdev) 542 + { 543 + struct erdma_dev *dev = pci_get_drvdata(pdev); 544 + 545 + unregister_netdevice_notifier(&dev->netdev_nb); 546 + ib_unregister_device(&dev->ibdev); 547 + 548 + erdma_res_cb_free(dev); 549 + xa_destroy(&dev->qp_xa); 550 + xa_destroy(&dev->cq_xa); 551 + } 552 + 553 + static int erdma_probe(struct pci_dev *pdev, const struct pci_device_id *ent) 554 + { 555 + int ret; 556 + 557 + ret = erdma_probe_dev(pdev); 558 + if (ret) 559 + return ret; 560 + 561 + ret = erdma_ib_device_add(pdev); 562 + if (ret) { 563 + erdma_remove_dev(pdev); 564 + return ret; 565 + } 566 + 567 + return 0; 568 + } 569 + 570 + static void erdma_remove(struct pci_dev *pdev) 571 + { 572 + erdma_ib_device_remove(pdev); 573 + erdma_remove_dev(pdev); 574 + } 575 + 576 + static struct pci_driver erdma_pci_driver = { 577 + .name = DRV_MODULE_NAME, 578 + .id_table = erdma_pci_tbl, 579 + .probe = erdma_probe, 580 + .remove = erdma_remove 581 + }; 582 + 583 + MODULE_DEVICE_TABLE(pci, erdma_pci_tbl); 584 + 585 + static __init int erdma_init_module(void) 586 + { 587 + int ret; 588 + 589 + ret = erdma_cm_init(); 590 + if (ret) 591 + return ret; 592 + 593 + ret = pci_register_driver(&erdma_pci_driver); 594 + if (ret) 595 + erdma_cm_exit(); 596 + 597 + return ret; 598 + } 599 + 600 + static void __exit erdma_exit_module(void) 601 + { 602 + pci_unregister_driver(&erdma_pci_driver); 603 + 604 + erdma_cm_exit(); 605 + } 606 + 607 + module_init(erdma_init_module); 608 + module_exit(erdma_exit_module);
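The direct-WQE split in erdma_dwqe_resource_init() above is easier to follow with the arithmetic worked through once. The sketch below mirrors that logic as a standalone user-space program; the ERDMA_DWQE_* constants and the grp_num value are illustrative assumptions, not the driver's real definitions from erdma_hw.h.

/*
 * Standalone sketch of the direct-WQE (DWQE) page split done by
 * erdma_dwqe_resource_init(). All ERDMA_DWQE_* values below are
 * assumed placeholders for illustration only.
 */
#include <stdbool.h>
#include <stdio.h>

#define ERDMA_DWQE_MAX_GRP_CNT 128		/* assumed value */
#define ERDMA_DWQE_TYPE0_CNT 64			/* assumed value */
#define ERDMA_DWQE_TYPE1_CNT 496		/* assumed value */
#define ERDMA_DWQE_TYPE1_CNT_PER_PAGE 16	/* assumed value */

int main(void)
{
	unsigned int grp_num = 8;	/* example readout of ERDMA_REGS_GRP_NUM_REG */
	unsigned int total_pages, type0, type1;
	bool disable_dwqe = grp_num < 4;

	total_pages = grp_num * 4;	/* one BAR page holds four groups */

	if (grp_num >= ERDMA_DWQE_MAX_GRP_CNT) {
		type0 = ERDMA_DWQE_TYPE0_CNT;
		type1 = ERDMA_DWQE_TYPE1_CNT / ERDMA_DWQE_TYPE1_CNT_PER_PAGE;
	} else {
		type1 = total_pages / 3;		/* one third of the pages carry type-1 entries */
		type0 = total_pages - type1 - 1;	/* the rest, minus the shared page, are type-0 */
	}

	/* For grp_num = 8: 32 pages -> 21 type-0 pages and 10 type-1 pages (160 entries). */
	printf("dwqe %s: %u type-0 pages, %u type-1 entries\n",
	       disable_dwqe ? "disabled" : "enabled",
	       type0, type1 * ERDMA_DWQE_TYPE1_CNT_PER_PAGE);
	return 0;
}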
+566
drivers/infiniband/hw/erdma/erdma_qp.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause 2 + 3 + /* Authors: Cheng Xu <chengyou@linux.alibaba.com> */ 4 + /* Kai Shen <kaishen@linux.alibaba.com> */ 5 + /* Copyright (c) 2020-2021, Alibaba Group */ 6 + /* Authors: Bernard Metzler <bmt@zurich.ibm.com> */ 7 + /* Copyright (c) 2008-2019, IBM Corporation */ 8 + 9 + #include <linux/errno.h> 10 + #include <linux/pci.h> 11 + #include <linux/scatterlist.h> 12 + #include <linux/types.h> 13 + 14 + #include <rdma/ib_user_verbs.h> 15 + #include <rdma/ib_verbs.h> 16 + 17 + #include "erdma.h" 18 + #include "erdma_cm.h" 19 + #include "erdma_verbs.h" 20 + 21 + void erdma_qp_llp_close(struct erdma_qp *qp) 22 + { 23 + struct erdma_qp_attrs qp_attrs; 24 + 25 + down_write(&qp->state_lock); 26 + 27 + switch (qp->attrs.state) { 28 + case ERDMA_QP_STATE_RTS: 29 + case ERDMA_QP_STATE_RTR: 30 + case ERDMA_QP_STATE_IDLE: 31 + case ERDMA_QP_STATE_TERMINATE: 32 + qp_attrs.state = ERDMA_QP_STATE_CLOSING; 33 + erdma_modify_qp_internal(qp, &qp_attrs, ERDMA_QP_ATTR_STATE); 34 + break; 35 + case ERDMA_QP_STATE_CLOSING: 36 + qp->attrs.state = ERDMA_QP_STATE_IDLE; 37 + break; 38 + default: 39 + break; 40 + } 41 + 42 + if (qp->cep) { 43 + erdma_cep_put(qp->cep); 44 + qp->cep = NULL; 45 + } 46 + 47 + up_write(&qp->state_lock); 48 + } 49 + 50 + struct ib_qp *erdma_get_ibqp(struct ib_device *ibdev, int id) 51 + { 52 + struct erdma_qp *qp = find_qp_by_qpn(to_edev(ibdev), id); 53 + 54 + if (qp) 55 + return &qp->ibqp; 56 + 57 + return NULL; 58 + } 59 + 60 + static int erdma_modify_qp_state_to_rts(struct erdma_qp *qp, 61 + struct erdma_qp_attrs *attrs, 62 + enum erdma_qp_attr_mask mask) 63 + { 64 + int ret; 65 + struct erdma_dev *dev = qp->dev; 66 + struct erdma_cmdq_modify_qp_req req; 67 + struct tcp_sock *tp; 68 + struct erdma_cep *cep = qp->cep; 69 + struct sockaddr_storage local_addr, remote_addr; 70 + 71 + if (!(mask & ERDMA_QP_ATTR_LLP_HANDLE)) 72 + return -EINVAL; 73 + 74 + if (!(mask & ERDMA_QP_ATTR_MPA)) 75 + return -EINVAL; 76 + 77 + ret = getname_local(cep->sock, &local_addr); 78 + if (ret < 0) 79 + return ret; 80 + 81 + ret = getname_peer(cep->sock, &remote_addr); 82 + if (ret < 0) 83 + return ret; 84 + 85 + qp->attrs.state = ERDMA_QP_STATE_RTS; 86 + 87 + tp = tcp_sk(qp->cep->sock->sk); 88 + 89 + erdma_cmdq_build_reqhdr(&req.hdr, CMDQ_SUBMOD_RDMA, 90 + CMDQ_OPCODE_MODIFY_QP); 91 + 92 + req.cfg = FIELD_PREP(ERDMA_CMD_MODIFY_QP_STATE_MASK, qp->attrs.state) | 93 + FIELD_PREP(ERDMA_CMD_MODIFY_QP_CC_MASK, qp->attrs.cc) | 94 + FIELD_PREP(ERDMA_CMD_MODIFY_QP_QPN_MASK, QP_ID(qp)); 95 + 96 + req.cookie = be32_to_cpu(qp->cep->mpa.ext_data.cookie); 97 + req.dip = to_sockaddr_in(remote_addr).sin_addr.s_addr; 98 + req.sip = to_sockaddr_in(local_addr).sin_addr.s_addr; 99 + req.dport = to_sockaddr_in(remote_addr).sin_port; 100 + req.sport = to_sockaddr_in(local_addr).sin_port; 101 + 102 + req.send_nxt = tp->snd_nxt; 103 + /* rsvd tcp seq for mpa-rsp in server. 
*/ 104 + if (qp->attrs.qp_type == ERDMA_QP_PASSIVE) 105 + req.send_nxt += MPA_DEFAULT_HDR_LEN + qp->attrs.pd_len; 106 + req.recv_nxt = tp->rcv_nxt; 107 + 108 + return erdma_post_cmd_wait(&dev->cmdq, (u64 *)&req, sizeof(req), NULL, 109 + NULL); 110 + } 111 + 112 + static int erdma_modify_qp_state_to_stop(struct erdma_qp *qp, 113 + struct erdma_qp_attrs *attrs, 114 + enum erdma_qp_attr_mask mask) 115 + { 116 + struct erdma_dev *dev = qp->dev; 117 + struct erdma_cmdq_modify_qp_req req; 118 + 119 + qp->attrs.state = attrs->state; 120 + 121 + erdma_cmdq_build_reqhdr(&req.hdr, CMDQ_SUBMOD_RDMA, 122 + CMDQ_OPCODE_MODIFY_QP); 123 + 124 + req.cfg = FIELD_PREP(ERDMA_CMD_MODIFY_QP_STATE_MASK, attrs->state) | 125 + FIELD_PREP(ERDMA_CMD_MODIFY_QP_QPN_MASK, QP_ID(qp)); 126 + 127 + return erdma_post_cmd_wait(&dev->cmdq, (u64 *)&req, sizeof(req), NULL, 128 + NULL); 129 + } 130 + 131 + int erdma_modify_qp_internal(struct erdma_qp *qp, struct erdma_qp_attrs *attrs, 132 + enum erdma_qp_attr_mask mask) 133 + { 134 + int drop_conn, ret = 0; 135 + 136 + if (!mask) 137 + return 0; 138 + 139 + if (!(mask & ERDMA_QP_ATTR_STATE)) 140 + return 0; 141 + 142 + switch (qp->attrs.state) { 143 + case ERDMA_QP_STATE_IDLE: 144 + case ERDMA_QP_STATE_RTR: 145 + if (attrs->state == ERDMA_QP_STATE_RTS) { 146 + ret = erdma_modify_qp_state_to_rts(qp, attrs, mask); 147 + } else if (attrs->state == ERDMA_QP_STATE_ERROR) { 148 + qp->attrs.state = ERDMA_QP_STATE_ERROR; 149 + if (qp->cep) { 150 + erdma_cep_put(qp->cep); 151 + qp->cep = NULL; 152 + } 153 + ret = erdma_modify_qp_state_to_stop(qp, attrs, mask); 154 + } 155 + break; 156 + case ERDMA_QP_STATE_RTS: 157 + drop_conn = 0; 158 + 159 + if (attrs->state == ERDMA_QP_STATE_CLOSING) { 160 + ret = erdma_modify_qp_state_to_stop(qp, attrs, mask); 161 + drop_conn = 1; 162 + } else if (attrs->state == ERDMA_QP_STATE_TERMINATE) { 163 + qp->attrs.state = ERDMA_QP_STATE_TERMINATE; 164 + ret = erdma_modify_qp_state_to_stop(qp, attrs, mask); 165 + drop_conn = 1; 166 + } else if (attrs->state == ERDMA_QP_STATE_ERROR) { 167 + ret = erdma_modify_qp_state_to_stop(qp, attrs, mask); 168 + qp->attrs.state = ERDMA_QP_STATE_ERROR; 169 + drop_conn = 1; 170 + } 171 + 172 + if (drop_conn) 173 + erdma_qp_cm_drop(qp); 174 + 175 + break; 176 + case ERDMA_QP_STATE_TERMINATE: 177 + if (attrs->state == ERDMA_QP_STATE_ERROR) 178 + qp->attrs.state = ERDMA_QP_STATE_ERROR; 179 + break; 180 + case ERDMA_QP_STATE_CLOSING: 181 + if (attrs->state == ERDMA_QP_STATE_IDLE) { 182 + qp->attrs.state = ERDMA_QP_STATE_IDLE; 183 + } else if (attrs->state == ERDMA_QP_STATE_ERROR) { 184 + ret = erdma_modify_qp_state_to_stop(qp, attrs, mask); 185 + qp->attrs.state = ERDMA_QP_STATE_ERROR; 186 + } else if (attrs->state != ERDMA_QP_STATE_CLOSING) { 187 + return -ECONNABORTED; 188 + } 189 + break; 190 + default: 191 + break; 192 + } 193 + 194 + return ret; 195 + } 196 + 197 + static void erdma_qp_safe_free(struct kref *ref) 198 + { 199 + struct erdma_qp *qp = container_of(ref, struct erdma_qp, ref); 200 + 201 + complete(&qp->safe_free); 202 + } 203 + 204 + void erdma_qp_put(struct erdma_qp *qp) 205 + { 206 + WARN_ON(kref_read(&qp->ref) < 1); 207 + kref_put(&qp->ref, erdma_qp_safe_free); 208 + } 209 + 210 + void erdma_qp_get(struct erdma_qp *qp) 211 + { 212 + kref_get(&qp->ref); 213 + } 214 + 215 + static int fill_inline_data(struct erdma_qp *qp, 216 + const struct ib_send_wr *send_wr, u16 wqe_idx, 217 + u32 sgl_offset, __le32 *length_field) 218 + { 219 + u32 remain_size, copy_size, data_off, bytes = 0; 220 + char *data; 221 + int i 
= 0; 222 + 223 + wqe_idx += (sgl_offset >> SQEBB_SHIFT); 224 + sgl_offset &= (SQEBB_SIZE - 1); 225 + data = get_queue_entry(qp->kern_qp.sq_buf, wqe_idx, qp->attrs.sq_size, 226 + SQEBB_SHIFT); 227 + 228 + while (i < send_wr->num_sge) { 229 + bytes += send_wr->sg_list[i].length; 230 + if (bytes > (int)ERDMA_MAX_INLINE) 231 + return -EINVAL; 232 + 233 + remain_size = send_wr->sg_list[i].length; 234 + data_off = 0; 235 + 236 + while (1) { 237 + copy_size = min(remain_size, SQEBB_SIZE - sgl_offset); 238 + 239 + memcpy(data + sgl_offset, 240 + (void *)(uintptr_t)send_wr->sg_list[i].addr + 241 + data_off, 242 + copy_size); 243 + remain_size -= copy_size; 244 + data_off += copy_size; 245 + sgl_offset += copy_size; 246 + wqe_idx += (sgl_offset >> SQEBB_SHIFT); 247 + sgl_offset &= (SQEBB_SIZE - 1); 248 + 249 + data = get_queue_entry(qp->kern_qp.sq_buf, wqe_idx, 250 + qp->attrs.sq_size, SQEBB_SHIFT); 251 + if (!remain_size) 252 + break; 253 + } 254 + 255 + i++; 256 + } 257 + *length_field = cpu_to_le32(bytes); 258 + 259 + return bytes; 260 + } 261 + 262 + static int fill_sgl(struct erdma_qp *qp, const struct ib_send_wr *send_wr, 263 + u16 wqe_idx, u32 sgl_offset, __le32 *length_field) 264 + { 265 + int i = 0; 266 + u32 bytes = 0; 267 + char *sgl; 268 + 269 + if (send_wr->num_sge > qp->dev->attrs.max_send_sge) 270 + return -EINVAL; 271 + 272 + if (sgl_offset & 0xF) 273 + return -EINVAL; 274 + 275 + while (i < send_wr->num_sge) { 276 + wqe_idx += (sgl_offset >> SQEBB_SHIFT); 277 + sgl_offset &= (SQEBB_SIZE - 1); 278 + sgl = get_queue_entry(qp->kern_qp.sq_buf, wqe_idx, 279 + qp->attrs.sq_size, SQEBB_SHIFT); 280 + 281 + bytes += send_wr->sg_list[i].length; 282 + memcpy(sgl + sgl_offset, &send_wr->sg_list[i], 283 + sizeof(struct ib_sge)); 284 + 285 + sgl_offset += sizeof(struct ib_sge); 286 + i++; 287 + } 288 + 289 + *length_field = cpu_to_le32(bytes); 290 + return 0; 291 + } 292 + 293 + static int erdma_push_one_sqe(struct erdma_qp *qp, u16 *pi, 294 + const struct ib_send_wr *send_wr) 295 + { 296 + u32 wqe_size, wqebb_cnt, hw_op, flags, sgl_offset; 297 + u32 idx = *pi & (qp->attrs.sq_size - 1); 298 + enum ib_wr_opcode op = send_wr->opcode; 299 + struct erdma_readreq_sqe *read_sqe; 300 + struct erdma_reg_mr_sqe *regmr_sge; 301 + struct erdma_write_sqe *write_sqe; 302 + struct erdma_send_sqe *send_sqe; 303 + struct ib_rdma_wr *rdma_wr; 304 + struct erdma_mr *mr; 305 + __le32 *length_field; 306 + u64 wqe_hdr, *entry; 307 + struct ib_sge *sge; 308 + u32 attrs; 309 + int ret; 310 + 311 + entry = get_queue_entry(qp->kern_qp.sq_buf, idx, qp->attrs.sq_size, 312 + SQEBB_SHIFT); 313 + 314 + /* Clear the SQE header section. */ 315 + *entry = 0; 316 + 317 + qp->kern_qp.swr_tbl[idx] = send_wr->wr_id; 318 + flags = send_wr->send_flags; 319 + wqe_hdr = FIELD_PREP( 320 + ERDMA_SQE_HDR_CE_MASK, 321 + ((flags & IB_SEND_SIGNALED) || qp->kern_qp.sig_all) ? 1 : 0); 322 + wqe_hdr |= FIELD_PREP(ERDMA_SQE_HDR_SE_MASK, 323 + flags & IB_SEND_SOLICITED ? 1 : 0); 324 + wqe_hdr |= FIELD_PREP(ERDMA_SQE_HDR_FENCE_MASK, 325 + flags & IB_SEND_FENCE ? 1 : 0); 326 + wqe_hdr |= FIELD_PREP(ERDMA_SQE_HDR_INLINE_MASK, 327 + flags & IB_SEND_INLINE ? 
1 : 0); 328 + wqe_hdr |= FIELD_PREP(ERDMA_SQE_HDR_QPN_MASK, QP_ID(qp)); 329 + 330 + switch (op) { 331 + case IB_WR_RDMA_WRITE: 332 + case IB_WR_RDMA_WRITE_WITH_IMM: 333 + hw_op = ERDMA_OP_WRITE; 334 + if (op == IB_WR_RDMA_WRITE_WITH_IMM) 335 + hw_op = ERDMA_OP_WRITE_WITH_IMM; 336 + wqe_hdr |= FIELD_PREP(ERDMA_SQE_HDR_OPCODE_MASK, hw_op); 337 + rdma_wr = container_of(send_wr, struct ib_rdma_wr, wr); 338 + write_sqe = (struct erdma_write_sqe *)entry; 339 + 340 + write_sqe->imm_data = send_wr->ex.imm_data; 341 + write_sqe->sink_stag = cpu_to_le32(rdma_wr->rkey); 342 + write_sqe->sink_to_h = 343 + cpu_to_le32(upper_32_bits(rdma_wr->remote_addr)); 344 + write_sqe->sink_to_l = 345 + cpu_to_le32(lower_32_bits(rdma_wr->remote_addr)); 346 + 347 + length_field = &write_sqe->length; 348 + wqe_size = sizeof(struct erdma_write_sqe); 349 + sgl_offset = wqe_size; 350 + break; 351 + case IB_WR_RDMA_READ: 352 + case IB_WR_RDMA_READ_WITH_INV: 353 + read_sqe = (struct erdma_readreq_sqe *)entry; 354 + if (unlikely(send_wr->num_sge != 1)) 355 + return -EINVAL; 356 + hw_op = ERDMA_OP_READ; 357 + if (op == IB_WR_RDMA_READ_WITH_INV) { 358 + hw_op = ERDMA_OP_READ_WITH_INV; 359 + read_sqe->invalid_stag = 360 + cpu_to_le32(send_wr->ex.invalidate_rkey); 361 + } 362 + 363 + wqe_hdr |= FIELD_PREP(ERDMA_SQE_HDR_OPCODE_MASK, hw_op); 364 + rdma_wr = container_of(send_wr, struct ib_rdma_wr, wr); 365 + read_sqe->length = cpu_to_le32(send_wr->sg_list[0].length); 366 + read_sqe->sink_stag = cpu_to_le32(send_wr->sg_list[0].lkey); 367 + read_sqe->sink_to_l = 368 + cpu_to_le32(lower_32_bits(send_wr->sg_list[0].addr)); 369 + read_sqe->sink_to_h = 370 + cpu_to_le32(upper_32_bits(send_wr->sg_list[0].addr)); 371 + 372 + sge = get_queue_entry(qp->kern_qp.sq_buf, idx + 1, 373 + qp->attrs.sq_size, SQEBB_SHIFT); 374 + sge->addr = rdma_wr->remote_addr; 375 + sge->lkey = rdma_wr->rkey; 376 + sge->length = send_wr->sg_list[0].length; 377 + wqe_size = sizeof(struct erdma_readreq_sqe) + 378 + send_wr->num_sge * sizeof(struct ib_sge); 379 + 380 + goto out; 381 + case IB_WR_SEND: 382 + case IB_WR_SEND_WITH_IMM: 383 + case IB_WR_SEND_WITH_INV: 384 + send_sqe = (struct erdma_send_sqe *)entry; 385 + hw_op = ERDMA_OP_SEND; 386 + if (op == IB_WR_SEND_WITH_IMM) { 387 + hw_op = ERDMA_OP_SEND_WITH_IMM; 388 + send_sqe->imm_data = send_wr->ex.imm_data; 389 + } else if (op == IB_WR_SEND_WITH_INV) { 390 + hw_op = ERDMA_OP_SEND_WITH_INV; 391 + send_sqe->invalid_stag = 392 + cpu_to_le32(send_wr->ex.invalidate_rkey); 393 + } 394 + wqe_hdr |= FIELD_PREP(ERDMA_SQE_HDR_OPCODE_MASK, hw_op); 395 + length_field = &send_sqe->length; 396 + wqe_size = sizeof(struct erdma_send_sqe); 397 + sgl_offset = wqe_size; 398 + 399 + break; 400 + case IB_WR_REG_MR: 401 + wqe_hdr |= 402 + FIELD_PREP(ERDMA_SQE_HDR_OPCODE_MASK, ERDMA_OP_REG_MR); 403 + regmr_sge = (struct erdma_reg_mr_sqe *)entry; 404 + mr = to_emr(reg_wr(send_wr)->mr); 405 + 406 + mr->access = ERDMA_MR_ACC_LR | 407 + to_erdma_access_flags(reg_wr(send_wr)->access); 408 + regmr_sge->addr = cpu_to_le64(mr->ibmr.iova); 409 + regmr_sge->length = cpu_to_le32(mr->ibmr.length); 410 + regmr_sge->stag = cpu_to_le32(mr->ibmr.lkey); 411 + attrs = FIELD_PREP(ERDMA_SQE_MR_MODE_MASK, 0) | 412 + FIELD_PREP(ERDMA_SQE_MR_ACCESS_MASK, mr->access) | 413 + FIELD_PREP(ERDMA_SQE_MR_MTT_CNT_MASK, 414 + mr->mem.mtt_nents); 415 + 416 + if (mr->mem.mtt_nents < ERDMA_MAX_INLINE_MTT_ENTRIES) { 417 + attrs |= FIELD_PREP(ERDMA_SQE_MR_MTT_TYPE_MASK, 0); 418 + /* Copy SGLs to SQE content to accelerate */ 419 + 
memcpy(get_queue_entry(qp->kern_qp.sq_buf, idx + 1, 420 + qp->attrs.sq_size, SQEBB_SHIFT), 421 + mr->mem.mtt_buf, MTT_SIZE(mr->mem.mtt_nents)); 422 + wqe_size = sizeof(struct erdma_reg_mr_sqe) + 423 + MTT_SIZE(mr->mem.mtt_nents); 424 + } else { 425 + attrs |= FIELD_PREP(ERDMA_SQE_MR_MTT_TYPE_MASK, 1); 426 + wqe_size = sizeof(struct erdma_reg_mr_sqe); 427 + } 428 + 429 + regmr_sge->attrs = cpu_to_le32(attrs); 430 + goto out; 431 + case IB_WR_LOCAL_INV: 432 + wqe_hdr |= FIELD_PREP(ERDMA_SQE_HDR_OPCODE_MASK, 433 + ERDMA_OP_LOCAL_INV); 434 + regmr_sge = (struct erdma_reg_mr_sqe *)entry; 435 + regmr_sge->stag = cpu_to_le32(send_wr->ex.invalidate_rkey); 436 + wqe_size = sizeof(struct erdma_reg_mr_sqe); 437 + goto out; 438 + default: 439 + return -EOPNOTSUPP; 440 + } 441 + 442 + if (flags & IB_SEND_INLINE) { 443 + ret = fill_inline_data(qp, send_wr, idx, sgl_offset, 444 + length_field); 445 + if (ret < 0) 446 + return -EINVAL; 447 + wqe_size += ret; 448 + wqe_hdr |= FIELD_PREP(ERDMA_SQE_HDR_SGL_LEN_MASK, ret); 449 + } else { 450 + ret = fill_sgl(qp, send_wr, idx, sgl_offset, length_field); 451 + if (ret) 452 + return -EINVAL; 453 + wqe_size += send_wr->num_sge * sizeof(struct ib_sge); 454 + wqe_hdr |= FIELD_PREP(ERDMA_SQE_HDR_SGL_LEN_MASK, 455 + send_wr->num_sge); 456 + } 457 + 458 + out: 459 + wqebb_cnt = SQEBB_COUNT(wqe_size); 460 + wqe_hdr |= FIELD_PREP(ERDMA_SQE_HDR_WQEBB_CNT_MASK, wqebb_cnt - 1); 461 + *pi += wqebb_cnt; 462 + wqe_hdr |= FIELD_PREP(ERDMA_SQE_HDR_WQEBB_INDEX_MASK, *pi); 463 + 464 + *entry = wqe_hdr; 465 + 466 + return 0; 467 + } 468 + 469 + static void kick_sq_db(struct erdma_qp *qp, u16 pi) 470 + { 471 + u64 db_data = FIELD_PREP(ERDMA_SQE_HDR_QPN_MASK, QP_ID(qp)) | 472 + FIELD_PREP(ERDMA_SQE_HDR_WQEBB_INDEX_MASK, pi); 473 + 474 + *(u64 *)qp->kern_qp.sq_db_info = db_data; 475 + writeq(db_data, qp->kern_qp.hw_sq_db); 476 + } 477 + 478 + int erdma_post_send(struct ib_qp *ibqp, const struct ib_send_wr *send_wr, 479 + const struct ib_send_wr **bad_send_wr) 480 + { 481 + struct erdma_qp *qp = to_eqp(ibqp); 482 + int ret = 0; 483 + const struct ib_send_wr *wr = send_wr; 484 + unsigned long flags; 485 + u16 sq_pi; 486 + 487 + if (!send_wr) 488 + return -EINVAL; 489 + 490 + spin_lock_irqsave(&qp->lock, flags); 491 + sq_pi = qp->kern_qp.sq_pi; 492 + 493 + while (wr) { 494 + if ((u16)(sq_pi - qp->kern_qp.sq_ci) >= qp->attrs.sq_size) { 495 + ret = -ENOMEM; 496 + *bad_send_wr = send_wr; 497 + break; 498 + } 499 + 500 + ret = erdma_push_one_sqe(qp, &sq_pi, wr); 501 + if (ret) { 502 + *bad_send_wr = wr; 503 + break; 504 + } 505 + qp->kern_qp.sq_pi = sq_pi; 506 + kick_sq_db(qp, sq_pi); 507 + 508 + wr = wr->next; 509 + } 510 + spin_unlock_irqrestore(&qp->lock, flags); 511 + 512 + return ret; 513 + } 514 + 515 + static int erdma_post_recv_one(struct erdma_qp *qp, 516 + const struct ib_recv_wr *recv_wr) 517 + { 518 + struct erdma_rqe *rqe = 519 + get_queue_entry(qp->kern_qp.rq_buf, qp->kern_qp.rq_pi, 520 + qp->attrs.rq_size, RQE_SHIFT); 521 + 522 + rqe->qe_idx = cpu_to_le16(qp->kern_qp.rq_pi + 1); 523 + rqe->qpn = cpu_to_le32(QP_ID(qp)); 524 + 525 + if (recv_wr->num_sge == 0) { 526 + rqe->length = 0; 527 + } else if (recv_wr->num_sge == 1) { 528 + rqe->stag = cpu_to_le32(recv_wr->sg_list[0].lkey); 529 + rqe->to = cpu_to_le64(recv_wr->sg_list[0].addr); 530 + rqe->length = cpu_to_le32(recv_wr->sg_list[0].length); 531 + } else { 532 + return -EINVAL; 533 + } 534 + 535 + *(u64 *)qp->kern_qp.rq_db_info = *(u64 *)rqe; 536 + writeq(*(u64 *)rqe, qp->kern_qp.hw_rq_db); 537 + 538 + 
qp->kern_qp.rwr_tbl[qp->kern_qp.rq_pi & (qp->attrs.rq_size - 1)] = 539 + recv_wr->wr_id; 540 + qp->kern_qp.rq_pi++; 541 + 542 + return 0; 543 + } 544 + 545 + int erdma_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *recv_wr, 546 + const struct ib_recv_wr **bad_recv_wr) 547 + { 548 + const struct ib_recv_wr *wr = recv_wr; 549 + struct erdma_qp *qp = to_eqp(ibqp); 550 + unsigned long flags; 551 + int ret; 552 + 553 + spin_lock_irqsave(&qp->lock, flags); 554 + 555 + while (wr) { 556 + ret = erdma_post_recv_one(qp, wr); 557 + if (ret) { 558 + *bad_recv_wr = wr; 559 + break; 560 + } 561 + wr = wr->next; 562 + } 563 + 564 + spin_unlock_irqrestore(&qp->lock, flags); 565 + return ret; 566 + }
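The send-queue bookkeeping in erdma_post_send()/erdma_push_one_sqe() above relies on a power-of-two ring of WQE building blocks (SQEBBs): the producer index is free-running, the slot actually written is the index masked by (sq_size - 1), and fullness is detected with wrap-safe u16 arithmetic against the consumer index. A minimal standalone sketch of that idiom follows; SQEBB_SIZE, SQEBB_SHIFT, and the SQEBB_COUNT() macro here are assumed illustrative definitions, not necessarily the driver's own.

/*
 * Sketch of the SQ ring arithmetic used when pushing one SQE:
 * producer/consumer indices wrap naturally as u16 values while the
 * ring itself is addressed modulo a power-of-two depth.
 */
#include <stdint.h>
#include <stdio.h>

#define SQEBB_SIZE 32u	/* assumed building-block size */
#define SQEBB_SHIFT 5
#define SQEBB_COUNT(size) (((size) + SQEBB_SIZE - 1) >> SQEBB_SHIFT)

int main(void)
{
	uint16_t pi = 0, ci = 0;		/* free-running producer/consumer indices */
	const uint16_t sq_size = 128;		/* queue depth in SQEBBs, power of two */
	uint32_t wqe_size = 48;			/* e.g. SQE header plus one small inline payload */
	uint32_t wqebb_cnt = SQEBB_COUNT(wqe_size);

	/* Ring is full once the distance between pi and ci reaches the depth. */
	if ((uint16_t)(pi - ci) >= sq_size) {
		fprintf(stderr, "SQ full\n");
		return 1;
	}

	/* The written slot is pi masked into the ring; pi then advances by wqebb_cnt. */
	printf("write %u SQEBB(s) at slot %u, new pi %u\n",
	       wqebb_cnt,
	       (unsigned int)(pi & (sq_size - 1)),
	       (unsigned int)(uint16_t)(pi + wqebb_cnt));
	return 0;
}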
+1460
drivers/infiniband/hw/erdma/erdma_verbs.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + 3 + /* Authors: Cheng Xu <chengyou@linux.alibaba.com> */ 4 + /* Kai Shen <kaishen@linux.alibaba.com> */ 5 + /* Copyright (c) 2020-2022, Alibaba Group. */ 6 + 7 + /* Authors: Bernard Metzler <bmt@zurich.ibm.com> */ 8 + /* Copyright (c) 2008-2019, IBM Corporation */ 9 + 10 + /* Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved. */ 11 + 12 + #include <linux/errno.h> 13 + #include <linux/pci.h> 14 + #include <linux/types.h> 15 + #include <linux/uaccess.h> 16 + #include <linux/vmalloc.h> 17 + #include <net/addrconf.h> 18 + #include <rdma/erdma-abi.h> 19 + #include <rdma/ib_umem.h> 20 + #include <rdma/ib_user_verbs.h> 21 + #include <rdma/ib_verbs.h> 22 + #include <rdma/uverbs_ioctl.h> 23 + 24 + #include "erdma.h" 25 + #include "erdma_cm.h" 26 + #include "erdma_hw.h" 27 + #include "erdma_verbs.h" 28 + 29 + static int create_qp_cmd(struct erdma_dev *dev, struct erdma_qp *qp) 30 + { 31 + struct erdma_cmdq_create_qp_req req; 32 + struct erdma_pd *pd = to_epd(qp->ibqp.pd); 33 + struct erdma_uqp *user_qp; 34 + u64 resp0, resp1; 35 + int err; 36 + 37 + erdma_cmdq_build_reqhdr(&req.hdr, CMDQ_SUBMOD_RDMA, 38 + CMDQ_OPCODE_CREATE_QP); 39 + 40 + req.cfg0 = FIELD_PREP(ERDMA_CMD_CREATE_QP_SQ_DEPTH_MASK, 41 + ilog2(qp->attrs.sq_size)) | 42 + FIELD_PREP(ERDMA_CMD_CREATE_QP_QPN_MASK, QP_ID(qp)); 43 + req.cfg1 = FIELD_PREP(ERDMA_CMD_CREATE_QP_RQ_DEPTH_MASK, 44 + ilog2(qp->attrs.rq_size)) | 45 + FIELD_PREP(ERDMA_CMD_CREATE_QP_PD_MASK, pd->pdn); 46 + 47 + if (rdma_is_kernel_res(&qp->ibqp.res)) { 48 + u32 pgsz_range = ilog2(SZ_1M) - PAGE_SHIFT; 49 + 50 + req.sq_cqn_mtt_cfg = 51 + FIELD_PREP(ERDMA_CMD_CREATE_QP_PAGE_SIZE_MASK, 52 + pgsz_range) | 53 + FIELD_PREP(ERDMA_CMD_CREATE_QP_CQN_MASK, qp->scq->cqn); 54 + req.rq_cqn_mtt_cfg = 55 + FIELD_PREP(ERDMA_CMD_CREATE_QP_PAGE_SIZE_MASK, 56 + pgsz_range) | 57 + FIELD_PREP(ERDMA_CMD_CREATE_QP_CQN_MASK, qp->rcq->cqn); 58 + 59 + req.sq_mtt_cfg = 60 + FIELD_PREP(ERDMA_CMD_CREATE_QP_PAGE_OFFSET_MASK, 0) | 61 + FIELD_PREP(ERDMA_CMD_CREATE_QP_MTT_CNT_MASK, 1) | 62 + FIELD_PREP(ERDMA_CMD_CREATE_QP_MTT_TYPE_MASK, 63 + ERDMA_MR_INLINE_MTT); 64 + req.rq_mtt_cfg = req.sq_mtt_cfg; 65 + 66 + req.rq_buf_addr = qp->kern_qp.rq_buf_dma_addr; 67 + req.sq_buf_addr = qp->kern_qp.sq_buf_dma_addr; 68 + req.sq_db_info_dma_addr = qp->kern_qp.sq_buf_dma_addr + 69 + (qp->attrs.sq_size << SQEBB_SHIFT); 70 + req.rq_db_info_dma_addr = qp->kern_qp.rq_buf_dma_addr + 71 + (qp->attrs.rq_size << RQE_SHIFT); 72 + } else { 73 + user_qp = &qp->user_qp; 74 + req.sq_cqn_mtt_cfg = FIELD_PREP( 75 + ERDMA_CMD_CREATE_QP_PAGE_SIZE_MASK, 76 + ilog2(user_qp->sq_mtt.page_size) - PAGE_SHIFT); 77 + req.sq_cqn_mtt_cfg |= 78 + FIELD_PREP(ERDMA_CMD_CREATE_QP_CQN_MASK, qp->scq->cqn); 79 + 80 + req.rq_cqn_mtt_cfg = FIELD_PREP( 81 + ERDMA_CMD_CREATE_QP_PAGE_SIZE_MASK, 82 + ilog2(user_qp->rq_mtt.page_size) - PAGE_SHIFT); 83 + req.rq_cqn_mtt_cfg |= 84 + FIELD_PREP(ERDMA_CMD_CREATE_QP_CQN_MASK, qp->rcq->cqn); 85 + 86 + req.sq_mtt_cfg = user_qp->sq_mtt.page_offset; 87 + req.sq_mtt_cfg |= FIELD_PREP(ERDMA_CMD_CREATE_QP_MTT_CNT_MASK, 88 + user_qp->sq_mtt.mtt_nents) | 89 + FIELD_PREP(ERDMA_CMD_CREATE_QP_MTT_TYPE_MASK, 90 + user_qp->sq_mtt.mtt_type); 91 + 92 + req.rq_mtt_cfg = user_qp->rq_mtt.page_offset; 93 + req.rq_mtt_cfg |= FIELD_PREP(ERDMA_CMD_CREATE_QP_MTT_CNT_MASK, 94 + user_qp->rq_mtt.mtt_nents) | 95 + FIELD_PREP(ERDMA_CMD_CREATE_QP_MTT_TYPE_MASK, 96 + user_qp->rq_mtt.mtt_type); 97 + 98 + req.sq_buf_addr = user_qp->sq_mtt.mtt_entry[0]; 99 + req.rq_buf_addr = 
user_qp->rq_mtt.mtt_entry[0]; 100 + 101 + req.sq_db_info_dma_addr = user_qp->sq_db_info_dma_addr; 102 + req.rq_db_info_dma_addr = user_qp->rq_db_info_dma_addr; 103 + } 104 + 105 + err = erdma_post_cmd_wait(&dev->cmdq, (u64 *)&req, sizeof(req), &resp0, 106 + &resp1); 107 + if (!err) 108 + qp->attrs.cookie = 109 + FIELD_GET(ERDMA_CMDQ_CREATE_QP_RESP_COOKIE_MASK, resp0); 110 + 111 + return err; 112 + } 113 + 114 + static int regmr_cmd(struct erdma_dev *dev, struct erdma_mr *mr) 115 + { 116 + struct erdma_cmdq_reg_mr_req req; 117 + struct erdma_pd *pd = to_epd(mr->ibmr.pd); 118 + u64 *phy_addr; 119 + int i; 120 + 121 + erdma_cmdq_build_reqhdr(&req.hdr, CMDQ_SUBMOD_RDMA, CMDQ_OPCODE_REG_MR); 122 + 123 + req.cfg0 = FIELD_PREP(ERDMA_CMD_MR_VALID_MASK, mr->valid) | 124 + FIELD_PREP(ERDMA_CMD_MR_KEY_MASK, mr->ibmr.lkey & 0xFF) | 125 + FIELD_PREP(ERDMA_CMD_MR_MPT_IDX_MASK, mr->ibmr.lkey >> 8); 126 + req.cfg1 = FIELD_PREP(ERDMA_CMD_REGMR_PD_MASK, pd->pdn) | 127 + FIELD_PREP(ERDMA_CMD_REGMR_TYPE_MASK, mr->type) | 128 + FIELD_PREP(ERDMA_CMD_REGMR_RIGHT_MASK, mr->access) | 129 + FIELD_PREP(ERDMA_CMD_REGMR_ACC_MODE_MASK, 0); 130 + req.cfg2 = FIELD_PREP(ERDMA_CMD_REGMR_PAGESIZE_MASK, 131 + ilog2(mr->mem.page_size)) | 132 + FIELD_PREP(ERDMA_CMD_REGMR_MTT_TYPE_MASK, mr->mem.mtt_type) | 133 + FIELD_PREP(ERDMA_CMD_REGMR_MTT_CNT_MASK, mr->mem.page_cnt); 134 + 135 + if (mr->type == ERDMA_MR_TYPE_DMA) 136 + goto post_cmd; 137 + 138 + if (mr->type == ERDMA_MR_TYPE_NORMAL) { 139 + req.start_va = mr->mem.va; 140 + req.size = mr->mem.len; 141 + } 142 + 143 + if (mr->type == ERDMA_MR_TYPE_FRMR || 144 + mr->mem.mtt_type == ERDMA_MR_INDIRECT_MTT) { 145 + phy_addr = req.phy_addr; 146 + *phy_addr = mr->mem.mtt_entry[0]; 147 + } else { 148 + phy_addr = req.phy_addr; 149 + for (i = 0; i < mr->mem.mtt_nents; i++) 150 + *phy_addr++ = mr->mem.mtt_entry[i]; 151 + } 152 + 153 + post_cmd: 154 + return erdma_post_cmd_wait(&dev->cmdq, (u64 *)&req, sizeof(req), NULL, 155 + NULL); 156 + } 157 + 158 + static int create_cq_cmd(struct erdma_dev *dev, struct erdma_cq *cq) 159 + { 160 + struct erdma_cmdq_create_cq_req req; 161 + u32 page_size; 162 + struct erdma_mem *mtt; 163 + 164 + erdma_cmdq_build_reqhdr(&req.hdr, CMDQ_SUBMOD_RDMA, 165 + CMDQ_OPCODE_CREATE_CQ); 166 + 167 + req.cfg0 = FIELD_PREP(ERDMA_CMD_CREATE_CQ_CQN_MASK, cq->cqn) | 168 + FIELD_PREP(ERDMA_CMD_CREATE_CQ_DEPTH_MASK, ilog2(cq->depth)); 169 + req.cfg1 = FIELD_PREP(ERDMA_CMD_CREATE_CQ_EQN_MASK, cq->assoc_eqn); 170 + 171 + if (rdma_is_kernel_res(&cq->ibcq.res)) { 172 + page_size = SZ_32M; 173 + req.cfg0 |= FIELD_PREP(ERDMA_CMD_CREATE_CQ_PAGESIZE_MASK, 174 + ilog2(page_size) - PAGE_SHIFT); 175 + req.qbuf_addr_l = lower_32_bits(cq->kern_cq.qbuf_dma_addr); 176 + req.qbuf_addr_h = upper_32_bits(cq->kern_cq.qbuf_dma_addr); 177 + 178 + req.cfg1 |= FIELD_PREP(ERDMA_CMD_CREATE_CQ_MTT_CNT_MASK, 1) | 179 + FIELD_PREP(ERDMA_CMD_CREATE_CQ_MTT_TYPE_MASK, 180 + ERDMA_MR_INLINE_MTT); 181 + 182 + req.first_page_offset = 0; 183 + req.cq_db_info_addr = 184 + cq->kern_cq.qbuf_dma_addr + (cq->depth << CQE_SHIFT); 185 + } else { 186 + mtt = &cq->user_cq.qbuf_mtt; 187 + req.cfg0 |= FIELD_PREP(ERDMA_CMD_CREATE_CQ_PAGESIZE_MASK, 188 + ilog2(mtt->page_size) - PAGE_SHIFT); 189 + if (mtt->mtt_nents == 1) { 190 + req.qbuf_addr_l = lower_32_bits(*(u64 *)mtt->mtt_buf); 191 + req.qbuf_addr_h = upper_32_bits(*(u64 *)mtt->mtt_buf); 192 + } else { 193 + req.qbuf_addr_l = lower_32_bits(mtt->mtt_entry[0]); 194 + req.qbuf_addr_h = upper_32_bits(mtt->mtt_entry[0]); 195 + } 196 + req.cfg1 |= 
FIELD_PREP(ERDMA_CMD_CREATE_CQ_MTT_CNT_MASK, 197 + mtt->mtt_nents); 198 + req.cfg1 |= FIELD_PREP(ERDMA_CMD_CREATE_CQ_MTT_TYPE_MASK, 199 + mtt->mtt_type); 200 + 201 + req.first_page_offset = mtt->page_offset; 202 + req.cq_db_info_addr = cq->user_cq.db_info_dma_addr; 203 + } 204 + 205 + return erdma_post_cmd_wait(&dev->cmdq, (u64 *)&req, sizeof(req), NULL, 206 + NULL); 207 + } 208 + 209 + static int erdma_alloc_idx(struct erdma_resource_cb *res_cb) 210 + { 211 + int idx; 212 + unsigned long flags; 213 + 214 + spin_lock_irqsave(&res_cb->lock, flags); 215 + idx = find_next_zero_bit(res_cb->bitmap, res_cb->max_cap, 216 + res_cb->next_alloc_idx); 217 + if (idx == res_cb->max_cap) { 218 + idx = find_first_zero_bit(res_cb->bitmap, res_cb->max_cap); 219 + if (idx == res_cb->max_cap) { 220 + res_cb->next_alloc_idx = 1; 221 + spin_unlock_irqrestore(&res_cb->lock, flags); 222 + return -ENOSPC; 223 + } 224 + } 225 + 226 + set_bit(idx, res_cb->bitmap); 227 + res_cb->next_alloc_idx = idx + 1; 228 + spin_unlock_irqrestore(&res_cb->lock, flags); 229 + 230 + return idx; 231 + } 232 + 233 + static inline void erdma_free_idx(struct erdma_resource_cb *res_cb, u32 idx) 234 + { 235 + unsigned long flags; 236 + u32 used; 237 + 238 + spin_lock_irqsave(&res_cb->lock, flags); 239 + used = __test_and_clear_bit(idx, res_cb->bitmap); 240 + spin_unlock_irqrestore(&res_cb->lock, flags); 241 + WARN_ON(!used); 242 + } 243 + 244 + static struct rdma_user_mmap_entry * 245 + erdma_user_mmap_entry_insert(struct erdma_ucontext *uctx, void *address, 246 + u32 size, u8 mmap_flag, u64 *mmap_offset) 247 + { 248 + struct erdma_user_mmap_entry *entry = 249 + kzalloc(sizeof(*entry), GFP_KERNEL); 250 + int ret; 251 + 252 + if (!entry) 253 + return NULL; 254 + 255 + entry->address = (u64)address; 256 + entry->mmap_flag = mmap_flag; 257 + 258 + size = PAGE_ALIGN(size); 259 + 260 + ret = rdma_user_mmap_entry_insert(&uctx->ibucontext, &entry->rdma_entry, 261 + size); 262 + if (ret) { 263 + kfree(entry); 264 + return NULL; 265 + } 266 + 267 + *mmap_offset = rdma_user_mmap_get_offset(&entry->rdma_entry); 268 + 269 + return &entry->rdma_entry; 270 + } 271 + 272 + int erdma_query_device(struct ib_device *ibdev, struct ib_device_attr *attr, 273 + struct ib_udata *unused) 274 + { 275 + struct erdma_dev *dev = to_edev(ibdev); 276 + 277 + memset(attr, 0, sizeof(*attr)); 278 + 279 + attr->max_mr_size = dev->attrs.max_mr_size; 280 + attr->vendor_id = PCI_VENDOR_ID_ALIBABA; 281 + attr->vendor_part_id = dev->pdev->device; 282 + attr->hw_ver = dev->pdev->revision; 283 + attr->max_qp = dev->attrs.max_qp; 284 + attr->max_qp_wr = min(dev->attrs.max_send_wr, dev->attrs.max_recv_wr); 285 + attr->max_qp_rd_atom = dev->attrs.max_ord; 286 + attr->max_qp_init_rd_atom = dev->attrs.max_ird; 287 + attr->max_res_rd_atom = dev->attrs.max_qp * dev->attrs.max_ird; 288 + attr->device_cap_flags = IB_DEVICE_MEM_MGT_EXTENSIONS; 289 + attr->kernel_cap_flags = IBK_LOCAL_DMA_LKEY; 290 + ibdev->local_dma_lkey = dev->attrs.local_dma_key; 291 + attr->max_send_sge = dev->attrs.max_send_sge; 292 + attr->max_recv_sge = dev->attrs.max_recv_sge; 293 + attr->max_sge_rd = dev->attrs.max_sge_rd; 294 + attr->max_cq = dev->attrs.max_cq; 295 + attr->max_cqe = dev->attrs.max_cqe; 296 + attr->max_mr = dev->attrs.max_mr; 297 + attr->max_pd = dev->attrs.max_pd; 298 + attr->max_mw = dev->attrs.max_mw; 299 + attr->max_fast_reg_page_list_len = ERDMA_MAX_FRMR_PA; 300 + attr->page_size_cap = ERDMA_PAGE_SIZE_SUPPORT; 301 + attr->fw_ver = dev->attrs.fw_version; 302 + 303 + if (dev->netdev) 304 + 
addrconf_addr_eui48((u8 *)&attr->sys_image_guid, 305 + dev->netdev->dev_addr); 306 + 307 + return 0; 308 + } 309 + 310 + int erdma_query_gid(struct ib_device *ibdev, u32 port, int idx, 311 + union ib_gid *gid) 312 + { 313 + struct erdma_dev *dev = to_edev(ibdev); 314 + 315 + memset(gid, 0, sizeof(*gid)); 316 + ether_addr_copy(gid->raw, dev->attrs.peer_addr); 317 + 318 + return 0; 319 + } 320 + 321 + int erdma_query_port(struct ib_device *ibdev, u32 port, 322 + struct ib_port_attr *attr) 323 + { 324 + struct erdma_dev *dev = to_edev(ibdev); 325 + struct net_device *ndev = dev->netdev; 326 + 327 + memset(attr, 0, sizeof(*attr)); 328 + 329 + attr->gid_tbl_len = 1; 330 + attr->port_cap_flags = IB_PORT_CM_SUP | IB_PORT_DEVICE_MGMT_SUP; 331 + attr->max_msg_sz = -1; 332 + 333 + if (!ndev) 334 + goto out; 335 + 336 + ib_get_eth_speed(ibdev, port, &attr->active_speed, &attr->active_width); 337 + attr->max_mtu = ib_mtu_int_to_enum(ndev->mtu); 338 + attr->active_mtu = ib_mtu_int_to_enum(ndev->mtu); 339 + if (netif_running(ndev) && netif_carrier_ok(ndev)) 340 + dev->state = IB_PORT_ACTIVE; 341 + else 342 + dev->state = IB_PORT_DOWN; 343 + attr->state = dev->state; 344 + 345 + out: 346 + if (dev->state == IB_PORT_ACTIVE) 347 + attr->phys_state = IB_PORT_PHYS_STATE_LINK_UP; 348 + else 349 + attr->phys_state = IB_PORT_PHYS_STATE_DISABLED; 350 + 351 + return 0; 352 + } 353 + 354 + int erdma_get_port_immutable(struct ib_device *ibdev, u32 port, 355 + struct ib_port_immutable *port_immutable) 356 + { 357 + port_immutable->gid_tbl_len = 1; 358 + port_immutable->core_cap_flags = RDMA_CORE_PORT_IWARP; 359 + 360 + return 0; 361 + } 362 + 363 + int erdma_alloc_pd(struct ib_pd *ibpd, struct ib_udata *udata) 364 + { 365 + struct erdma_pd *pd = to_epd(ibpd); 366 + struct erdma_dev *dev = to_edev(ibpd->device); 367 + int pdn; 368 + 369 + pdn = erdma_alloc_idx(&dev->res_cb[ERDMA_RES_TYPE_PD]); 370 + if (pdn < 0) 371 + return pdn; 372 + 373 + pd->pdn = pdn; 374 + 375 + return 0; 376 + } 377 + 378 + int erdma_dealloc_pd(struct ib_pd *ibpd, struct ib_udata *udata) 379 + { 380 + struct erdma_pd *pd = to_epd(ibpd); 381 + struct erdma_dev *dev = to_edev(ibpd->device); 382 + 383 + erdma_free_idx(&dev->res_cb[ERDMA_RES_TYPE_PD], pd->pdn); 384 + 385 + return 0; 386 + } 387 + 388 + static int erdma_qp_validate_cap(struct erdma_dev *dev, 389 + struct ib_qp_init_attr *attrs) 390 + { 391 + if ((attrs->cap.max_send_wr > dev->attrs.max_send_wr) || 392 + (attrs->cap.max_recv_wr > dev->attrs.max_recv_wr) || 393 + (attrs->cap.max_send_sge > dev->attrs.max_send_sge) || 394 + (attrs->cap.max_recv_sge > dev->attrs.max_recv_sge) || 395 + (attrs->cap.max_inline_data > ERDMA_MAX_INLINE) || 396 + !attrs->cap.max_send_wr || !attrs->cap.max_recv_wr) { 397 + return -EINVAL; 398 + } 399 + 400 + return 0; 401 + } 402 + 403 + static int erdma_qp_validate_attr(struct erdma_dev *dev, 404 + struct ib_qp_init_attr *attrs) 405 + { 406 + if (attrs->qp_type != IB_QPT_RC) 407 + return -EOPNOTSUPP; 408 + 409 + if (attrs->srq) 410 + return -EOPNOTSUPP; 411 + 412 + if (!attrs->send_cq || !attrs->recv_cq) 413 + return -EOPNOTSUPP; 414 + 415 + return 0; 416 + } 417 + 418 + static void free_kernel_qp(struct erdma_qp *qp) 419 + { 420 + struct erdma_dev *dev = qp->dev; 421 + 422 + vfree(qp->kern_qp.swr_tbl); 423 + vfree(qp->kern_qp.rwr_tbl); 424 + 425 + if (qp->kern_qp.sq_buf) 426 + dma_free_coherent( 427 + &dev->pdev->dev, 428 + WARPPED_BUFSIZE(qp->attrs.sq_size << SQEBB_SHIFT), 429 + qp->kern_qp.sq_buf, qp->kern_qp.sq_buf_dma_addr); 430 + 431 + if 
(qp->kern_qp.rq_buf) 432 + dma_free_coherent( 433 + &dev->pdev->dev, 434 + WARPPED_BUFSIZE(qp->attrs.rq_size << RQE_SHIFT), 435 + qp->kern_qp.rq_buf, qp->kern_qp.rq_buf_dma_addr); 436 + } 437 + 438 + static int init_kernel_qp(struct erdma_dev *dev, struct erdma_qp *qp, 439 + struct ib_qp_init_attr *attrs) 440 + { 441 + struct erdma_kqp *kqp = &qp->kern_qp; 442 + int size; 443 + 444 + if (attrs->sq_sig_type == IB_SIGNAL_ALL_WR) 445 + kqp->sig_all = 1; 446 + 447 + kqp->sq_pi = 0; 448 + kqp->sq_ci = 0; 449 + kqp->rq_pi = 0; 450 + kqp->rq_ci = 0; 451 + kqp->hw_sq_db = 452 + dev->func_bar + (ERDMA_SDB_SHARED_PAGE_INDEX << PAGE_SHIFT); 453 + kqp->hw_rq_db = dev->func_bar + ERDMA_BAR_RQDB_SPACE_OFFSET; 454 + 455 + kqp->swr_tbl = vmalloc(qp->attrs.sq_size * sizeof(u64)); 456 + kqp->rwr_tbl = vmalloc(qp->attrs.rq_size * sizeof(u64)); 457 + if (!kqp->swr_tbl || !kqp->rwr_tbl) 458 + goto err_out; 459 + 460 + size = (qp->attrs.sq_size << SQEBB_SHIFT) + ERDMA_EXTRA_BUFFER_SIZE; 461 + kqp->sq_buf = dma_alloc_coherent(&dev->pdev->dev, size, 462 + &kqp->sq_buf_dma_addr, GFP_KERNEL); 463 + if (!kqp->sq_buf) 464 + goto err_out; 465 + 466 + size = (qp->attrs.rq_size << RQE_SHIFT) + ERDMA_EXTRA_BUFFER_SIZE; 467 + kqp->rq_buf = dma_alloc_coherent(&dev->pdev->dev, size, 468 + &kqp->rq_buf_dma_addr, GFP_KERNEL); 469 + if (!kqp->rq_buf) 470 + goto err_out; 471 + 472 + kqp->sq_db_info = kqp->sq_buf + (qp->attrs.sq_size << SQEBB_SHIFT); 473 + kqp->rq_db_info = kqp->rq_buf + (qp->attrs.rq_size << RQE_SHIFT); 474 + 475 + return 0; 476 + 477 + err_out: 478 + free_kernel_qp(qp); 479 + return -ENOMEM; 480 + } 481 + 482 + static int get_mtt_entries(struct erdma_dev *dev, struct erdma_mem *mem, 483 + u64 start, u64 len, int access, u64 virt, 484 + unsigned long req_page_size, u8 force_indirect_mtt) 485 + { 486 + struct ib_block_iter biter; 487 + uint64_t *phy_addr = NULL; 488 + int ret = 0; 489 + 490 + mem->umem = ib_umem_get(&dev->ibdev, start, len, access); 491 + if (IS_ERR(mem->umem)) { 492 + ret = PTR_ERR(mem->umem); 493 + mem->umem = NULL; 494 + return ret; 495 + } 496 + 497 + mem->va = virt; 498 + mem->len = len; 499 + mem->page_size = ib_umem_find_best_pgsz(mem->umem, req_page_size, virt); 500 + mem->page_offset = start & (mem->page_size - 1); 501 + mem->mtt_nents = ib_umem_num_dma_blocks(mem->umem, mem->page_size); 502 + mem->page_cnt = mem->mtt_nents; 503 + 504 + if (mem->page_cnt > ERDMA_MAX_INLINE_MTT_ENTRIES || 505 + force_indirect_mtt) { 506 + mem->mtt_type = ERDMA_MR_INDIRECT_MTT; 507 + mem->mtt_buf = 508 + alloc_pages_exact(MTT_SIZE(mem->page_cnt), GFP_KERNEL); 509 + if (!mem->mtt_buf) { 510 + ret = -ENOMEM; 511 + goto error_ret; 512 + } 513 + phy_addr = mem->mtt_buf; 514 + } else { 515 + mem->mtt_type = ERDMA_MR_INLINE_MTT; 516 + phy_addr = mem->mtt_entry; 517 + } 518 + 519 + rdma_umem_for_each_dma_block(mem->umem, &biter, mem->page_size) { 520 + *phy_addr = rdma_block_iter_dma_address(&biter); 521 + phy_addr++; 522 + } 523 + 524 + if (mem->mtt_type == ERDMA_MR_INDIRECT_MTT) { 525 + mem->mtt_entry[0] = 526 + dma_map_single(&dev->pdev->dev, mem->mtt_buf, 527 + MTT_SIZE(mem->page_cnt), DMA_TO_DEVICE); 528 + if (dma_mapping_error(&dev->pdev->dev, mem->mtt_entry[0])) { 529 + free_pages_exact(mem->mtt_buf, MTT_SIZE(mem->page_cnt)); 530 + mem->mtt_buf = NULL; 531 + ret = -ENOMEM; 532 + goto error_ret; 533 + } 534 + } 535 + 536 + return 0; 537 + 538 + error_ret: 539 + if (mem->umem) { 540 + ib_umem_release(mem->umem); 541 + mem->umem = NULL; 542 + } 543 + 544 + return ret; 545 + } 546 + 547 + static void 
put_mtt_entries(struct erdma_dev *dev, struct erdma_mem *mem) 548 + { 549 + if (mem->mtt_buf) { 550 + dma_unmap_single(&dev->pdev->dev, mem->mtt_entry[0], 551 + MTT_SIZE(mem->page_cnt), DMA_TO_DEVICE); 552 + free_pages_exact(mem->mtt_buf, MTT_SIZE(mem->page_cnt)); 553 + } 554 + 555 + if (mem->umem) { 556 + ib_umem_release(mem->umem); 557 + mem->umem = NULL; 558 + } 559 + } 560 + 561 + static int erdma_map_user_dbrecords(struct erdma_ucontext *ctx, 562 + u64 dbrecords_va, 563 + struct erdma_user_dbrecords_page **dbr_page, 564 + dma_addr_t *dma_addr) 565 + { 566 + struct erdma_user_dbrecords_page *page = NULL; 567 + int rv = 0; 568 + 569 + mutex_lock(&ctx->dbrecords_page_mutex); 570 + 571 + list_for_each_entry(page, &ctx->dbrecords_page_list, list) 572 + if (page->va == (dbrecords_va & PAGE_MASK)) 573 + goto found; 574 + 575 + page = kmalloc(sizeof(*page), GFP_KERNEL); 576 + if (!page) { 577 + rv = -ENOMEM; 578 + goto out; 579 + } 580 + 581 + page->va = (dbrecords_va & PAGE_MASK); 582 + page->refcnt = 0; 583 + 584 + page->umem = ib_umem_get(ctx->ibucontext.device, 585 + dbrecords_va & PAGE_MASK, PAGE_SIZE, 0); 586 + if (IS_ERR(page->umem)) { 587 + rv = PTR_ERR(page->umem); 588 + kfree(page); 589 + goto out; 590 + } 591 + 592 + list_add(&page->list, &ctx->dbrecords_page_list); 593 + 594 + found: 595 + *dma_addr = sg_dma_address(page->umem->sgt_append.sgt.sgl) + 596 + (dbrecords_va & ~PAGE_MASK); 597 + *dbr_page = page; 598 + page->refcnt++; 599 + 600 + out: 601 + mutex_unlock(&ctx->dbrecords_page_mutex); 602 + return rv; 603 + } 604 + 605 + static void 606 + erdma_unmap_user_dbrecords(struct erdma_ucontext *ctx, 607 + struct erdma_user_dbrecords_page **dbr_page) 608 + { 609 + if (!ctx || !(*dbr_page)) 610 + return; 611 + 612 + mutex_lock(&ctx->dbrecords_page_mutex); 613 + if (--(*dbr_page)->refcnt == 0) { 614 + list_del(&(*dbr_page)->list); 615 + ib_umem_release((*dbr_page)->umem); 616 + kfree(*dbr_page); 617 + } 618 + 619 + *dbr_page = NULL; 620 + mutex_unlock(&ctx->dbrecords_page_mutex); 621 + } 622 + 623 + static int init_user_qp(struct erdma_qp *qp, struct erdma_ucontext *uctx, 624 + u64 va, u32 len, u64 db_info_va) 625 + { 626 + dma_addr_t db_info_dma_addr; 627 + u32 rq_offset; 628 + int ret; 629 + 630 + if (len < (PAGE_ALIGN(qp->attrs.sq_size * SQEBB_SIZE) + 631 + qp->attrs.rq_size * RQE_SIZE)) 632 + return -EINVAL; 633 + 634 + ret = get_mtt_entries(qp->dev, &qp->user_qp.sq_mtt, va, 635 + qp->attrs.sq_size << SQEBB_SHIFT, 0, va, 636 + (SZ_1M - SZ_4K), 1); 637 + if (ret) 638 + return ret; 639 + 640 + rq_offset = PAGE_ALIGN(qp->attrs.sq_size << SQEBB_SHIFT); 641 + qp->user_qp.rq_offset = rq_offset; 642 + 643 + ret = get_mtt_entries(qp->dev, &qp->user_qp.rq_mtt, va + rq_offset, 644 + qp->attrs.rq_size << RQE_SHIFT, 0, va + rq_offset, 645 + (SZ_1M - SZ_4K), 1); 646 + if (ret) 647 + goto put_sq_mtt; 648 + 649 + ret = erdma_map_user_dbrecords(uctx, db_info_va, 650 + &qp->user_qp.user_dbr_page, 651 + &db_info_dma_addr); 652 + if (ret) 653 + goto put_rq_mtt; 654 + 655 + qp->user_qp.sq_db_info_dma_addr = db_info_dma_addr; 656 + qp->user_qp.rq_db_info_dma_addr = db_info_dma_addr + ERDMA_DB_SIZE; 657 + 658 + return 0; 659 + 660 + put_rq_mtt: 661 + put_mtt_entries(qp->dev, &qp->user_qp.rq_mtt); 662 + 663 + put_sq_mtt: 664 + put_mtt_entries(qp->dev, &qp->user_qp.sq_mtt); 665 + 666 + return ret; 667 + } 668 + 669 + static void free_user_qp(struct erdma_qp *qp, struct erdma_ucontext *uctx) 670 + { 671 + put_mtt_entries(qp->dev, &qp->user_qp.sq_mtt); 672 + put_mtt_entries(qp->dev, &qp->user_qp.rq_mtt); 
673 + erdma_unmap_user_dbrecords(uctx, &qp->user_qp.user_dbr_page); 674 + } 675 + 676 + int erdma_create_qp(struct ib_qp *ibqp, struct ib_qp_init_attr *attrs, 677 + struct ib_udata *udata) 678 + { 679 + struct erdma_qp *qp = to_eqp(ibqp); 680 + struct erdma_dev *dev = to_edev(ibqp->device); 681 + struct erdma_ucontext *uctx = rdma_udata_to_drv_context( 682 + udata, struct erdma_ucontext, ibucontext); 683 + struct erdma_ureq_create_qp ureq; 684 + struct erdma_uresp_create_qp uresp; 685 + int ret; 686 + 687 + ret = erdma_qp_validate_cap(dev, attrs); 688 + if (ret) 689 + goto err_out; 690 + 691 + ret = erdma_qp_validate_attr(dev, attrs); 692 + if (ret) 693 + goto err_out; 694 + 695 + qp->scq = to_ecq(attrs->send_cq); 696 + qp->rcq = to_ecq(attrs->recv_cq); 697 + qp->dev = dev; 698 + qp->attrs.cc = dev->attrs.cc; 699 + 700 + init_rwsem(&qp->state_lock); 701 + kref_init(&qp->ref); 702 + init_completion(&qp->safe_free); 703 + 704 + ret = xa_alloc_cyclic(&dev->qp_xa, &qp->ibqp.qp_num, qp, 705 + XA_LIMIT(1, dev->attrs.max_qp - 1), 706 + &dev->next_alloc_qpn, GFP_KERNEL); 707 + if (ret < 0) { 708 + ret = -ENOMEM; 709 + goto err_out; 710 + } 711 + 712 + qp->attrs.sq_size = roundup_pow_of_two(attrs->cap.max_send_wr * 713 + ERDMA_MAX_WQEBB_PER_SQE); 714 + qp->attrs.rq_size = roundup_pow_of_two(attrs->cap.max_recv_wr); 715 + 716 + if (uctx) { 717 + ret = ib_copy_from_udata(&ureq, udata, 718 + min(sizeof(ureq), udata->inlen)); 719 + if (ret) 720 + goto err_out_xa; 721 + 722 + ret = init_user_qp(qp, uctx, ureq.qbuf_va, ureq.qbuf_len, 723 + ureq.db_record_va); 724 + if (ret) 725 + goto err_out_xa; 726 + 727 + memset(&uresp, 0, sizeof(uresp)); 728 + 729 + uresp.num_sqe = qp->attrs.sq_size; 730 + uresp.num_rqe = qp->attrs.rq_size; 731 + uresp.qp_id = QP_ID(qp); 732 + uresp.rq_offset = qp->user_qp.rq_offset; 733 + 734 + ret = ib_copy_to_udata(udata, &uresp, sizeof(uresp)); 735 + if (ret) 736 + goto err_out_cmd; 737 + } else { 738 + init_kernel_qp(dev, qp, attrs); 739 + } 740 + 741 + qp->attrs.max_send_sge = attrs->cap.max_send_sge; 742 + qp->attrs.max_recv_sge = attrs->cap.max_recv_sge; 743 + qp->attrs.state = ERDMA_QP_STATE_IDLE; 744 + 745 + ret = create_qp_cmd(dev, qp); 746 + if (ret) 747 + goto err_out_cmd; 748 + 749 + spin_lock_init(&qp->lock); 750 + 751 + return 0; 752 + 753 + err_out_cmd: 754 + if (uctx) 755 + free_user_qp(qp, uctx); 756 + else 757 + free_kernel_qp(qp); 758 + err_out_xa: 759 + xa_erase(&dev->qp_xa, QP_ID(qp)); 760 + err_out: 761 + return ret; 762 + } 763 + 764 + static int erdma_create_stag(struct erdma_dev *dev, u32 *stag) 765 + { 766 + int stag_idx; 767 + 768 + stag_idx = erdma_alloc_idx(&dev->res_cb[ERDMA_RES_TYPE_STAG_IDX]); 769 + if (stag_idx < 0) 770 + return stag_idx; 771 + 772 + /* For now, we always let key field be zero. 
*/ 773 + *stag = (stag_idx << 8); 774 + 775 + return 0; 776 + } 777 + 778 + struct ib_mr *erdma_get_dma_mr(struct ib_pd *ibpd, int acc) 779 + { 780 + struct erdma_dev *dev = to_edev(ibpd->device); 781 + struct erdma_mr *mr; 782 + u32 stag; 783 + int ret; 784 + 785 + mr = kzalloc(sizeof(*mr), GFP_KERNEL); 786 + if (!mr) 787 + return ERR_PTR(-ENOMEM); 788 + 789 + ret = erdma_create_stag(dev, &stag); 790 + if (ret) 791 + goto out_free; 792 + 793 + mr->type = ERDMA_MR_TYPE_DMA; 794 + 795 + mr->ibmr.lkey = stag; 796 + mr->ibmr.rkey = stag; 797 + mr->ibmr.pd = ibpd; 798 + mr->access = ERDMA_MR_ACC_LR | to_erdma_access_flags(acc); 799 + ret = regmr_cmd(dev, mr); 800 + if (ret) 801 + goto out_remove_stag; 802 + 803 + return &mr->ibmr; 804 + 805 + out_remove_stag: 806 + erdma_free_idx(&dev->res_cb[ERDMA_RES_TYPE_STAG_IDX], 807 + mr->ibmr.lkey >> 8); 808 + 809 + out_free: 810 + kfree(mr); 811 + 812 + return ERR_PTR(ret); 813 + } 814 + 815 + struct ib_mr *erdma_ib_alloc_mr(struct ib_pd *ibpd, enum ib_mr_type mr_type, 816 + u32 max_num_sg) 817 + { 818 + struct erdma_mr *mr; 819 + struct erdma_dev *dev = to_edev(ibpd->device); 820 + int ret; 821 + u32 stag; 822 + 823 + if (mr_type != IB_MR_TYPE_MEM_REG) 824 + return ERR_PTR(-EOPNOTSUPP); 825 + 826 + if (max_num_sg > ERDMA_MR_MAX_MTT_CNT) 827 + return ERR_PTR(-EINVAL); 828 + 829 + mr = kzalloc(sizeof(*mr), GFP_KERNEL); 830 + if (!mr) 831 + return ERR_PTR(-ENOMEM); 832 + 833 + ret = erdma_create_stag(dev, &stag); 834 + if (ret) 835 + goto out_free; 836 + 837 + mr->type = ERDMA_MR_TYPE_FRMR; 838 + 839 + mr->ibmr.lkey = stag; 840 + mr->ibmr.rkey = stag; 841 + mr->ibmr.pd = ibpd; 842 + /* update it in FRMR. */ 843 + mr->access = ERDMA_MR_ACC_LR | ERDMA_MR_ACC_LW | ERDMA_MR_ACC_RR | 844 + ERDMA_MR_ACC_RW; 845 + 846 + mr->mem.page_size = PAGE_SIZE; /* update it later. 
*/ 847 + mr->mem.page_cnt = max_num_sg; 848 + mr->mem.mtt_type = ERDMA_MR_INDIRECT_MTT; 849 + mr->mem.mtt_buf = 850 + alloc_pages_exact(MTT_SIZE(mr->mem.page_cnt), GFP_KERNEL); 851 + if (!mr->mem.mtt_buf) { 852 + ret = -ENOMEM; 853 + goto out_remove_stag; 854 + } 855 + 856 + mr->mem.mtt_entry[0] = 857 + dma_map_single(&dev->pdev->dev, mr->mem.mtt_buf, 858 + MTT_SIZE(mr->mem.page_cnt), DMA_TO_DEVICE); 859 + if (dma_mapping_error(&dev->pdev->dev, mr->mem.mtt_entry[0])) { 860 + ret = -ENOMEM; 861 + goto out_free_mtt; 862 + } 863 + 864 + ret = regmr_cmd(dev, mr); 865 + if (ret) 866 + goto out_dma_unmap; 867 + 868 + return &mr->ibmr; 869 + 870 + out_dma_unmap: 871 + dma_unmap_single(&dev->pdev->dev, mr->mem.mtt_entry[0], 872 + MTT_SIZE(mr->mem.page_cnt), DMA_TO_DEVICE); 873 + out_free_mtt: 874 + free_pages_exact(mr->mem.mtt_buf, MTT_SIZE(mr->mem.page_cnt)); 875 + 876 + out_remove_stag: 877 + erdma_free_idx(&dev->res_cb[ERDMA_RES_TYPE_STAG_IDX], 878 + mr->ibmr.lkey >> 8); 879 + 880 + out_free: 881 + kfree(mr); 882 + 883 + return ERR_PTR(ret); 884 + } 885 + 886 + static int erdma_set_page(struct ib_mr *ibmr, u64 addr) 887 + { 888 + struct erdma_mr *mr = to_emr(ibmr); 889 + 890 + if (mr->mem.mtt_nents >= mr->mem.page_cnt) 891 + return -1; 892 + 893 + *((u64 *)mr->mem.mtt_buf + mr->mem.mtt_nents) = addr; 894 + mr->mem.mtt_nents++; 895 + 896 + return 0; 897 + } 898 + 899 + int erdma_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents, 900 + unsigned int *sg_offset) 901 + { 902 + struct erdma_mr *mr = to_emr(ibmr); 903 + int num; 904 + 905 + mr->mem.mtt_nents = 0; 906 + 907 + num = ib_sg_to_pages(&mr->ibmr, sg, sg_nents, sg_offset, 908 + erdma_set_page); 909 + 910 + return num; 911 + } 912 + 913 + struct ib_mr *erdma_reg_user_mr(struct ib_pd *ibpd, u64 start, u64 len, 914 + u64 virt, int access, struct ib_udata *udata) 915 + { 916 + struct erdma_mr *mr = NULL; 917 + struct erdma_dev *dev = to_edev(ibpd->device); 918 + u32 stag; 919 + int ret; 920 + 921 + if (!len || len > dev->attrs.max_mr_size) 922 + return ERR_PTR(-EINVAL); 923 + 924 + mr = kzalloc(sizeof(*mr), GFP_KERNEL); 925 + if (!mr) 926 + return ERR_PTR(-ENOMEM); 927 + 928 + ret = get_mtt_entries(dev, &mr->mem, start, len, access, virt, 929 + SZ_2G - SZ_4K, 0); 930 + if (ret) 931 + goto err_out_free; 932 + 933 + ret = erdma_create_stag(dev, &stag); 934 + if (ret) 935 + goto err_out_put_mtt; 936 + 937 + mr->ibmr.lkey = mr->ibmr.rkey = stag; 938 + mr->ibmr.pd = ibpd; 939 + mr->mem.va = virt; 940 + mr->mem.len = len; 941 + mr->access = ERDMA_MR_ACC_LR | to_erdma_access_flags(access); 942 + mr->valid = 1; 943 + mr->type = ERDMA_MR_TYPE_NORMAL; 944 + 945 + ret = regmr_cmd(dev, mr); 946 + if (ret) 947 + goto err_out_mr; 948 + 949 + return &mr->ibmr; 950 + 951 + err_out_mr: 952 + erdma_free_idx(&dev->res_cb[ERDMA_RES_TYPE_STAG_IDX], 953 + mr->ibmr.lkey >> 8); 954 + 955 + err_out_put_mtt: 956 + put_mtt_entries(dev, &mr->mem); 957 + 958 + err_out_free: 959 + kfree(mr); 960 + 961 + return ERR_PTR(ret); 962 + } 963 + 964 + int erdma_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata) 965 + { 966 + struct erdma_mr *mr; 967 + struct erdma_dev *dev = to_edev(ibmr->device); 968 + struct erdma_cmdq_dereg_mr_req req; 969 + int ret; 970 + 971 + mr = to_emr(ibmr); 972 + 973 + erdma_cmdq_build_reqhdr(&req.hdr, CMDQ_SUBMOD_RDMA, 974 + CMDQ_OPCODE_DEREG_MR); 975 + 976 + req.cfg = FIELD_PREP(ERDMA_CMD_MR_MPT_IDX_MASK, ibmr->lkey >> 8) | 977 + FIELD_PREP(ERDMA_CMD_MR_KEY_MASK, ibmr->lkey & 0xFF); 978 + 979 + ret = erdma_post_cmd_wait(&dev->cmdq, (u64 
*)&req, sizeof(req), NULL, 980 + NULL); 981 + if (ret) 982 + return ret; 983 + 984 + erdma_free_idx(&dev->res_cb[ERDMA_RES_TYPE_STAG_IDX], ibmr->lkey >> 8); 985 + 986 + put_mtt_entries(dev, &mr->mem); 987 + 988 + kfree(mr); 989 + return 0; 990 + } 991 + 992 + int erdma_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata) 993 + { 994 + struct erdma_cq *cq = to_ecq(ibcq); 995 + struct erdma_dev *dev = to_edev(ibcq->device); 996 + struct erdma_ucontext *ctx = rdma_udata_to_drv_context( 997 + udata, struct erdma_ucontext, ibucontext); 998 + int err; 999 + struct erdma_cmdq_destroy_cq_req req; 1000 + 1001 + erdma_cmdq_build_reqhdr(&req.hdr, CMDQ_SUBMOD_RDMA, 1002 + CMDQ_OPCODE_DESTROY_CQ); 1003 + req.cqn = cq->cqn; 1004 + 1005 + err = erdma_post_cmd_wait(&dev->cmdq, (u64 *)&req, sizeof(req), NULL, 1006 + NULL); 1007 + if (err) 1008 + return err; 1009 + 1010 + if (rdma_is_kernel_res(&cq->ibcq.res)) { 1011 + dma_free_coherent(&dev->pdev->dev, 1012 + WARPPED_BUFSIZE(cq->depth << CQE_SHIFT), 1013 + cq->kern_cq.qbuf, cq->kern_cq.qbuf_dma_addr); 1014 + } else { 1015 + erdma_unmap_user_dbrecords(ctx, &cq->user_cq.user_dbr_page); 1016 + put_mtt_entries(dev, &cq->user_cq.qbuf_mtt); 1017 + } 1018 + 1019 + xa_erase(&dev->cq_xa, cq->cqn); 1020 + 1021 + return 0; 1022 + } 1023 + 1024 + int erdma_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata) 1025 + { 1026 + struct erdma_qp *qp = to_eqp(ibqp); 1027 + struct erdma_dev *dev = to_edev(ibqp->device); 1028 + struct erdma_ucontext *ctx = rdma_udata_to_drv_context( 1029 + udata, struct erdma_ucontext, ibucontext); 1030 + struct erdma_qp_attrs qp_attrs; 1031 + int err; 1032 + struct erdma_cmdq_destroy_qp_req req; 1033 + 1034 + down_write(&qp->state_lock); 1035 + qp_attrs.state = ERDMA_QP_STATE_ERROR; 1036 + erdma_modify_qp_internal(qp, &qp_attrs, ERDMA_QP_ATTR_STATE); 1037 + up_write(&qp->state_lock); 1038 + 1039 + erdma_cmdq_build_reqhdr(&req.hdr, CMDQ_SUBMOD_RDMA, 1040 + CMDQ_OPCODE_DESTROY_QP); 1041 + req.qpn = QP_ID(qp); 1042 + 1043 + err = erdma_post_cmd_wait(&dev->cmdq, (u64 *)&req, sizeof(req), NULL, 1044 + NULL); 1045 + if (err) 1046 + return err; 1047 + 1048 + erdma_qp_put(qp); 1049 + wait_for_completion(&qp->safe_free); 1050 + 1051 + if (rdma_is_kernel_res(&qp->ibqp.res)) { 1052 + vfree(qp->kern_qp.swr_tbl); 1053 + vfree(qp->kern_qp.rwr_tbl); 1054 + dma_free_coherent( 1055 + &dev->pdev->dev, 1056 + WARPPED_BUFSIZE(qp->attrs.rq_size << RQE_SHIFT), 1057 + qp->kern_qp.rq_buf, qp->kern_qp.rq_buf_dma_addr); 1058 + dma_free_coherent( 1059 + &dev->pdev->dev, 1060 + WARPPED_BUFSIZE(qp->attrs.sq_size << SQEBB_SHIFT), 1061 + qp->kern_qp.sq_buf, qp->kern_qp.sq_buf_dma_addr); 1062 + } else { 1063 + put_mtt_entries(dev, &qp->user_qp.sq_mtt); 1064 + put_mtt_entries(dev, &qp->user_qp.rq_mtt); 1065 + erdma_unmap_user_dbrecords(ctx, &qp->user_qp.user_dbr_page); 1066 + } 1067 + 1068 + if (qp->cep) 1069 + erdma_cep_put(qp->cep); 1070 + xa_erase(&dev->qp_xa, QP_ID(qp)); 1071 + 1072 + return 0; 1073 + } 1074 + 1075 + void erdma_qp_get_ref(struct ib_qp *ibqp) 1076 + { 1077 + erdma_qp_get(to_eqp(ibqp)); 1078 + } 1079 + 1080 + void erdma_qp_put_ref(struct ib_qp *ibqp) 1081 + { 1082 + erdma_qp_put(to_eqp(ibqp)); 1083 + } 1084 + 1085 + int erdma_mmap(struct ib_ucontext *ctx, struct vm_area_struct *vma) 1086 + { 1087 + struct rdma_user_mmap_entry *rdma_entry; 1088 + struct erdma_user_mmap_entry *entry; 1089 + pgprot_t prot; 1090 + int err; 1091 + 1092 + rdma_entry = rdma_user_mmap_entry_get(ctx, vma); 1093 + if (!rdma_entry) 1094 + return -EINVAL; 1095 + 1096 + entry = 
to_emmap(rdma_entry); 1097 + 1098 + switch (entry->mmap_flag) { 1099 + case ERDMA_MMAP_IO_NC: 1100 + /* map doorbell. */ 1101 + prot = pgprot_device(vma->vm_page_prot); 1102 + break; 1103 + default: 1104 + return -EINVAL; 1105 + } 1106 + 1107 + err = rdma_user_mmap_io(ctx, vma, PFN_DOWN(entry->address), PAGE_SIZE, 1108 + prot, rdma_entry); 1109 + 1110 + rdma_user_mmap_entry_put(rdma_entry); 1111 + return err; 1112 + } 1113 + 1114 + void erdma_mmap_free(struct rdma_user_mmap_entry *rdma_entry) 1115 + { 1116 + struct erdma_user_mmap_entry *entry = to_emmap(rdma_entry); 1117 + 1118 + kfree(entry); 1119 + } 1120 + 1121 + #define ERDMA_SDB_PAGE 0 1122 + #define ERDMA_SDB_ENTRY 1 1123 + #define ERDMA_SDB_SHARED 2 1124 + 1125 + static void alloc_db_resources(struct erdma_dev *dev, 1126 + struct erdma_ucontext *ctx) 1127 + { 1128 + u32 bitmap_idx; 1129 + struct erdma_devattr *attrs = &dev->attrs; 1130 + 1131 + if (attrs->disable_dwqe) 1132 + goto alloc_normal_db; 1133 + 1134 + /* Try to alloc independent SDB page. */ 1135 + spin_lock(&dev->db_bitmap_lock); 1136 + bitmap_idx = find_first_zero_bit(dev->sdb_page, attrs->dwqe_pages); 1137 + if (bitmap_idx != attrs->dwqe_pages) { 1138 + set_bit(bitmap_idx, dev->sdb_page); 1139 + spin_unlock(&dev->db_bitmap_lock); 1140 + 1141 + ctx->sdb_type = ERDMA_SDB_PAGE; 1142 + ctx->sdb_idx = bitmap_idx; 1143 + ctx->sdb_page_idx = bitmap_idx; 1144 + ctx->sdb = dev->func_bar_addr + ERDMA_BAR_SQDB_SPACE_OFFSET + 1145 + (bitmap_idx << PAGE_SHIFT); 1146 + ctx->sdb_page_off = 0; 1147 + 1148 + return; 1149 + } 1150 + 1151 + bitmap_idx = find_first_zero_bit(dev->sdb_entry, attrs->dwqe_entries); 1152 + if (bitmap_idx != attrs->dwqe_entries) { 1153 + set_bit(bitmap_idx, dev->sdb_entry); 1154 + spin_unlock(&dev->db_bitmap_lock); 1155 + 1156 + ctx->sdb_type = ERDMA_SDB_ENTRY; 1157 + ctx->sdb_idx = bitmap_idx; 1158 + ctx->sdb_page_idx = attrs->dwqe_pages + 1159 + bitmap_idx / ERDMA_DWQE_TYPE1_CNT_PER_PAGE; 1160 + ctx->sdb_page_off = bitmap_idx % ERDMA_DWQE_TYPE1_CNT_PER_PAGE; 1161 + 1162 + ctx->sdb = dev->func_bar_addr + ERDMA_BAR_SQDB_SPACE_OFFSET + 1163 + (ctx->sdb_page_idx << PAGE_SHIFT); 1164 + 1165 + return; 1166 + } 1167 + 1168 + spin_unlock(&dev->db_bitmap_lock); 1169 + 1170 + alloc_normal_db: 1171 + ctx->sdb_type = ERDMA_SDB_SHARED; 1172 + ctx->sdb_idx = 0; 1173 + ctx->sdb_page_idx = ERDMA_SDB_SHARED_PAGE_INDEX; 1174 + ctx->sdb_page_off = 0; 1175 + 1176 + ctx->sdb = dev->func_bar_addr + (ctx->sdb_page_idx << PAGE_SHIFT); 1177 + } 1178 + 1179 + static void erdma_uctx_user_mmap_entries_remove(struct erdma_ucontext *uctx) 1180 + { 1181 + rdma_user_mmap_entry_remove(uctx->sq_db_mmap_entry); 1182 + rdma_user_mmap_entry_remove(uctx->rq_db_mmap_entry); 1183 + rdma_user_mmap_entry_remove(uctx->cq_db_mmap_entry); 1184 + } 1185 + 1186 + int erdma_alloc_ucontext(struct ib_ucontext *ibctx, struct ib_udata *udata) 1187 + { 1188 + struct erdma_ucontext *ctx = to_ectx(ibctx); 1189 + struct erdma_dev *dev = to_edev(ibctx->device); 1190 + int ret; 1191 + struct erdma_uresp_alloc_ctx uresp = {}; 1192 + 1193 + if (atomic_inc_return(&dev->num_ctx) > ERDMA_MAX_CONTEXT) { 1194 + ret = -ENOMEM; 1195 + goto err_out; 1196 + } 1197 + 1198 + INIT_LIST_HEAD(&ctx->dbrecords_page_list); 1199 + mutex_init(&ctx->dbrecords_page_mutex); 1200 + 1201 + alloc_db_resources(dev, ctx); 1202 + 1203 + ctx->rdb = dev->func_bar_addr + ERDMA_BAR_RQDB_SPACE_OFFSET; 1204 + ctx->cdb = dev->func_bar_addr + ERDMA_BAR_CQDB_SPACE_OFFSET; 1205 + 1206 + if (udata->outlen < sizeof(uresp)) { 1207 + ret = -EINVAL; 1208 + 
goto err_out; 1209 + } 1210 + 1211 + ctx->sq_db_mmap_entry = erdma_user_mmap_entry_insert( 1212 + ctx, (void *)ctx->sdb, PAGE_SIZE, ERDMA_MMAP_IO_NC, &uresp.sdb); 1213 + if (!ctx->sq_db_mmap_entry) { 1214 + ret = -ENOMEM; 1215 + goto err_out; 1216 + } 1217 + 1218 + ctx->rq_db_mmap_entry = erdma_user_mmap_entry_insert( 1219 + ctx, (void *)ctx->rdb, PAGE_SIZE, ERDMA_MMAP_IO_NC, &uresp.rdb); 1220 + if (!ctx->rq_db_mmap_entry) { 1221 + ret = -EINVAL; 1222 + goto err_out; 1223 + } 1224 + 1225 + ctx->cq_db_mmap_entry = erdma_user_mmap_entry_insert( 1226 + ctx, (void *)ctx->cdb, PAGE_SIZE, ERDMA_MMAP_IO_NC, &uresp.cdb); 1227 + if (!ctx->cq_db_mmap_entry) { 1228 + ret = -EINVAL; 1229 + goto err_out; 1230 + } 1231 + 1232 + uresp.dev_id = dev->pdev->device; 1233 + uresp.sdb_type = ctx->sdb_type; 1234 + uresp.sdb_offset = ctx->sdb_page_off; 1235 + 1236 + ret = ib_copy_to_udata(udata, &uresp, sizeof(uresp)); 1237 + if (ret) 1238 + goto err_out; 1239 + 1240 + return 0; 1241 + 1242 + err_out: 1243 + erdma_uctx_user_mmap_entries_remove(ctx); 1244 + atomic_dec(&dev->num_ctx); 1245 + return ret; 1246 + } 1247 + 1248 + void erdma_dealloc_ucontext(struct ib_ucontext *ibctx) 1249 + { 1250 + struct erdma_ucontext *ctx = to_ectx(ibctx); 1251 + struct erdma_dev *dev = to_edev(ibctx->device); 1252 + 1253 + spin_lock(&dev->db_bitmap_lock); 1254 + if (ctx->sdb_type == ERDMA_SDB_PAGE) 1255 + clear_bit(ctx->sdb_idx, dev->sdb_page); 1256 + else if (ctx->sdb_type == ERDMA_SDB_ENTRY) 1257 + clear_bit(ctx->sdb_idx, dev->sdb_entry); 1258 + 1259 + erdma_uctx_user_mmap_entries_remove(ctx); 1260 + 1261 + spin_unlock(&dev->db_bitmap_lock); 1262 + 1263 + atomic_dec(&dev->num_ctx); 1264 + } 1265 + 1266 + static int ib_qp_state_to_erdma_qp_state[IB_QPS_ERR + 1] = { 1267 + [IB_QPS_RESET] = ERDMA_QP_STATE_IDLE, 1268 + [IB_QPS_INIT] = ERDMA_QP_STATE_IDLE, 1269 + [IB_QPS_RTR] = ERDMA_QP_STATE_RTR, 1270 + [IB_QPS_RTS] = ERDMA_QP_STATE_RTS, 1271 + [IB_QPS_SQD] = ERDMA_QP_STATE_CLOSING, 1272 + [IB_QPS_SQE] = ERDMA_QP_STATE_TERMINATE, 1273 + [IB_QPS_ERR] = ERDMA_QP_STATE_ERROR 1274 + }; 1275 + 1276 + int erdma_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask, 1277 + struct ib_udata *udata) 1278 + { 1279 + struct erdma_qp_attrs new_attrs; 1280 + enum erdma_qp_attr_mask erdma_attr_mask = 0; 1281 + struct erdma_qp *qp = to_eqp(ibqp); 1282 + int ret = 0; 1283 + 1284 + if (attr_mask & ~IB_QP_ATTR_STANDARD_BITS) 1285 + return -EOPNOTSUPP; 1286 + 1287 + memset(&new_attrs, 0, sizeof(new_attrs)); 1288 + 1289 + if (attr_mask & IB_QP_STATE) { 1290 + new_attrs.state = ib_qp_state_to_erdma_qp_state[attr->qp_state]; 1291 + 1292 + erdma_attr_mask |= ERDMA_QP_ATTR_STATE; 1293 + } 1294 + 1295 + down_write(&qp->state_lock); 1296 + 1297 + ret = erdma_modify_qp_internal(qp, &new_attrs, erdma_attr_mask); 1298 + 1299 + up_write(&qp->state_lock); 1300 + 1301 + return ret; 1302 + } 1303 + 1304 + int erdma_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, 1305 + int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr) 1306 + { 1307 + struct erdma_qp *qp; 1308 + struct erdma_dev *dev; 1309 + 1310 + if (ibqp && qp_attr && qp_init_attr) { 1311 + qp = to_eqp(ibqp); 1312 + dev = to_edev(ibqp->device); 1313 + } else { 1314 + return -EINVAL; 1315 + } 1316 + 1317 + qp_attr->cap.max_inline_data = ERDMA_MAX_INLINE; 1318 + qp_init_attr->cap.max_inline_data = ERDMA_MAX_INLINE; 1319 + 1320 + qp_attr->cap.max_send_wr = qp->attrs.sq_size; 1321 + qp_attr->cap.max_recv_wr = qp->attrs.rq_size; 1322 + qp_attr->cap.max_send_sge = qp->attrs.max_send_sge; 
1323 + qp_attr->cap.max_recv_sge = qp->attrs.max_recv_sge; 1324 + 1325 + qp_attr->path_mtu = ib_mtu_int_to_enum(dev->netdev->mtu); 1326 + qp_attr->max_rd_atomic = qp->attrs.irq_size; 1327 + qp_attr->max_dest_rd_atomic = qp->attrs.orq_size; 1328 + 1329 + qp_attr->qp_access_flags = IB_ACCESS_LOCAL_WRITE | 1330 + IB_ACCESS_REMOTE_WRITE | 1331 + IB_ACCESS_REMOTE_READ; 1332 + 1333 + qp_init_attr->cap = qp_attr->cap; 1334 + 1335 + return 0; 1336 + } 1337 + 1338 + static int erdma_init_user_cq(struct erdma_ucontext *ctx, struct erdma_cq *cq, 1339 + struct erdma_ureq_create_cq *ureq) 1340 + { 1341 + int ret; 1342 + struct erdma_dev *dev = to_edev(cq->ibcq.device); 1343 + 1344 + ret = get_mtt_entries(dev, &cq->user_cq.qbuf_mtt, ureq->qbuf_va, 1345 + ureq->qbuf_len, 0, ureq->qbuf_va, SZ_64M - SZ_4K, 1346 + 1); 1347 + if (ret) 1348 + return ret; 1349 + 1350 + ret = erdma_map_user_dbrecords(ctx, ureq->db_record_va, 1351 + &cq->user_cq.user_dbr_page, 1352 + &cq->user_cq.db_info_dma_addr); 1353 + if (ret) 1354 + put_mtt_entries(dev, &cq->user_cq.qbuf_mtt); 1355 + 1356 + return ret; 1357 + } 1358 + 1359 + static int erdma_init_kernel_cq(struct erdma_cq *cq) 1360 + { 1361 + struct erdma_dev *dev = to_edev(cq->ibcq.device); 1362 + 1363 + cq->kern_cq.qbuf = 1364 + dma_alloc_coherent(&dev->pdev->dev, 1365 + WARPPED_BUFSIZE(cq->depth << CQE_SHIFT), 1366 + &cq->kern_cq.qbuf_dma_addr, GFP_KERNEL); 1367 + if (!cq->kern_cq.qbuf) 1368 + return -ENOMEM; 1369 + 1370 + cq->kern_cq.db_record = 1371 + (u64 *)(cq->kern_cq.qbuf + (cq->depth << CQE_SHIFT)); 1372 + spin_lock_init(&cq->kern_cq.lock); 1373 + /* use default cqdb addr */ 1374 + cq->kern_cq.db = dev->func_bar + ERDMA_BAR_CQDB_SPACE_OFFSET; 1375 + 1376 + return 0; 1377 + } 1378 + 1379 + int erdma_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, 1380 + struct ib_udata *udata) 1381 + { 1382 + struct erdma_cq *cq = to_ecq(ibcq); 1383 + struct erdma_dev *dev = to_edev(ibcq->device); 1384 + unsigned int depth = attr->cqe; 1385 + int ret; 1386 + struct erdma_ucontext *ctx = rdma_udata_to_drv_context( 1387 + udata, struct erdma_ucontext, ibucontext); 1388 + 1389 + if (depth > dev->attrs.max_cqe) 1390 + return -EINVAL; 1391 + 1392 + depth = roundup_pow_of_two(depth); 1393 + cq->ibcq.cqe = depth; 1394 + cq->depth = depth; 1395 + cq->assoc_eqn = attr->comp_vector + 1; 1396 + 1397 + ret = xa_alloc_cyclic(&dev->cq_xa, &cq->cqn, cq, 1398 + XA_LIMIT(1, dev->attrs.max_cq - 1), 1399 + &dev->next_alloc_cqn, GFP_KERNEL); 1400 + if (ret < 0) 1401 + return ret; 1402 + 1403 + if (!rdma_is_kernel_res(&ibcq->res)) { 1404 + struct erdma_ureq_create_cq ureq; 1405 + struct erdma_uresp_create_cq uresp; 1406 + 1407 + ret = ib_copy_from_udata(&ureq, udata, 1408 + min(udata->inlen, sizeof(ureq))); 1409 + if (ret) 1410 + goto err_out_xa; 1411 + 1412 + ret = erdma_init_user_cq(ctx, cq, &ureq); 1413 + if (ret) 1414 + goto err_out_xa; 1415 + 1416 + uresp.cq_id = cq->cqn; 1417 + uresp.num_cqe = depth; 1418 + 1419 + ret = ib_copy_to_udata(udata, &uresp, 1420 + min(sizeof(uresp), udata->outlen)); 1421 + if (ret) 1422 + goto err_free_res; 1423 + } else { 1424 + ret = erdma_init_kernel_cq(cq); 1425 + if (ret) 1426 + goto err_out_xa; 1427 + } 1428 + 1429 + ret = create_cq_cmd(dev, cq); 1430 + if (ret) 1431 + goto err_free_res; 1432 + 1433 + return 0; 1434 + 1435 + err_free_res: 1436 + if (!rdma_is_kernel_res(&ibcq->res)) { 1437 + erdma_unmap_user_dbrecords(ctx, &cq->user_cq.user_dbr_page); 1438 + put_mtt_entries(dev, &cq->user_cq.qbuf_mtt); 1439 + } else { 1440 + 
dma_free_coherent(&dev->pdev->dev, 1441 + WARPPED_BUFSIZE(depth << CQE_SHIFT), 1442 + cq->kern_cq.qbuf, cq->kern_cq.qbuf_dma_addr); 1443 + } 1444 + 1445 + err_out_xa: 1446 + xa_erase(&dev->cq_xa, cq->cqn); 1447 + 1448 + return ret; 1449 + } 1450 + 1451 + void erdma_port_event(struct erdma_dev *dev, enum ib_event_type reason) 1452 + { 1453 + struct ib_event event; 1454 + 1455 + event.device = &dev->ibdev; 1456 + event.element.port_num = 1; 1457 + event.event = reason; 1458 + 1459 + ib_dispatch_event(&event); 1460 + }
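The CQ path in erdma_verbs.c above hands out CQ numbers with xa_alloc_cyclic(), which reserves the ID and publishes the pointer in one step, and erases the entry again on the error path. A minimal sketch of that allocation pattern, with placeholder names (my_dev, my_cq) standing in for the driver structures:

#include <linux/xarray.h>

struct my_cq {
	u32 cqn;
};

struct my_dev {
	struct xarray cq_xa;	/* cqn -> struct my_cq *; init with xa_init_flags(&cq_xa, XA_FLAGS_ALLOC) */
	u32 next_cqn;		/* cursor so IDs are handed out cyclically */
};

/* Allocate a CQ number in [1, max_cq - 1] and publish the CQ under it. */
static int my_alloc_cqn(struct my_dev *dev, struct my_cq *cq, u32 max_cq)
{
	int ret;

	/* Returns 0 on success, 1 if the cursor wrapped, < 0 on error. */
	ret = xa_alloc_cyclic(&dev->cq_xa, &cq->cqn, cq,
			      XA_LIMIT(1, max_cq - 1),
			      &dev->next_cqn, GFP_KERNEL);
	return ret < 0 ? ret : 0;
}

On failure later in setup the entry is dropped with xa_erase(&dev->cq_xa, cq->cqn), matching the err_out_xa unwind in erdma_create_cq() above.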
+342
drivers/infiniband/hw/erdma/erdma_verbs.h
··· 1 + /* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */ 2 + 3 + /* Authors: Cheng Xu <chengyou@linux.alibaba.com> */ 4 + /* Kai Shen <kaishen@linux.alibaba.com> */ 5 + /* Copyright (c) 2020-2022, Alibaba Group. */ 6 + 7 + #ifndef __ERDMA_VERBS_H__ 8 + #define __ERDMA_VERBS_H__ 9 + 10 + #include <linux/errno.h> 11 + 12 + #include <rdma/ib_verbs.h> 13 + #include <rdma/ib_user_verbs.h> 14 + #include <rdma/iw_cm.h> 15 + 16 + #include "erdma.h" 17 + #include "erdma_cm.h" 18 + #include "erdma_hw.h" 19 + 20 + /* RDMA Capability. */ 21 + #define ERDMA_MAX_PD (128 * 1024) 22 + #define ERDMA_MAX_SEND_WR 4096 23 + #define ERDMA_MAX_ORD 128 24 + #define ERDMA_MAX_IRD 128 25 + #define ERDMA_MAX_SGE_RD 1 26 + #define ERDMA_MAX_CONTEXT (128 * 1024) 27 + #define ERDMA_MAX_SEND_SGE 6 28 + #define ERDMA_MAX_RECV_SGE 1 29 + #define ERDMA_MAX_INLINE (sizeof(struct erdma_sge) * (ERDMA_MAX_SEND_SGE)) 30 + #define ERDMA_MAX_FRMR_PA 512 31 + 32 + enum { 33 + ERDMA_MMAP_IO_NC = 0, /* no cache */ 34 + }; 35 + 36 + struct erdma_user_mmap_entry { 37 + struct rdma_user_mmap_entry rdma_entry; 38 + u64 address; 39 + u8 mmap_flag; 40 + }; 41 + 42 + struct erdma_ucontext { 43 + struct ib_ucontext ibucontext; 44 + 45 + u32 sdb_type; 46 + u32 sdb_idx; 47 + u32 sdb_page_idx; 48 + u32 sdb_page_off; 49 + u64 sdb; 50 + u64 rdb; 51 + u64 cdb; 52 + 53 + struct rdma_user_mmap_entry *sq_db_mmap_entry; 54 + struct rdma_user_mmap_entry *rq_db_mmap_entry; 55 + struct rdma_user_mmap_entry *cq_db_mmap_entry; 56 + 57 + /* doorbell records */ 58 + struct list_head dbrecords_page_list; 59 + struct mutex dbrecords_page_mutex; 60 + }; 61 + 62 + struct erdma_pd { 63 + struct ib_pd ibpd; 64 + u32 pdn; 65 + }; 66 + 67 + /* 68 + * MemoryRegion definition. 69 + */ 70 + #define ERDMA_MAX_INLINE_MTT_ENTRIES 4 71 + #define MTT_SIZE(mtt_cnt) (mtt_cnt << 3) /* per mtt takes 8 Bytes. */ 72 + #define ERDMA_MR_MAX_MTT_CNT 524288 73 + #define ERDMA_MTT_ENTRY_SIZE 8 74 + 75 + #define ERDMA_MR_TYPE_NORMAL 0 76 + #define ERDMA_MR_TYPE_FRMR 1 77 + #define ERDMA_MR_TYPE_DMA 2 78 + 79 + #define ERDMA_MR_INLINE_MTT 0 80 + #define ERDMA_MR_INDIRECT_MTT 1 81 + 82 + #define ERDMA_MR_ACC_LR BIT(0) 83 + #define ERDMA_MR_ACC_LW BIT(1) 84 + #define ERDMA_MR_ACC_RR BIT(2) 85 + #define ERDMA_MR_ACC_RW BIT(3) 86 + 87 + static inline u8 to_erdma_access_flags(int access) 88 + { 89 + return (access & IB_ACCESS_REMOTE_READ ? ERDMA_MR_ACC_RR : 0) | 90 + (access & IB_ACCESS_LOCAL_WRITE ? ERDMA_MR_ACC_LW : 0) | 91 + (access & IB_ACCESS_REMOTE_WRITE ? 
ERDMA_MR_ACC_RW : 0); 92 + } 93 + 94 + struct erdma_mem { 95 + struct ib_umem *umem; 96 + void *mtt_buf; 97 + u32 mtt_type; 98 + u32 page_size; 99 + u32 page_offset; 100 + u32 page_cnt; 101 + u32 mtt_nents; 102 + 103 + u64 va; 104 + u64 len; 105 + 106 + u64 mtt_entry[ERDMA_MAX_INLINE_MTT_ENTRIES]; 107 + }; 108 + 109 + struct erdma_mr { 110 + struct ib_mr ibmr; 111 + struct erdma_mem mem; 112 + u8 type; 113 + u8 access; 114 + u8 valid; 115 + }; 116 + 117 + struct erdma_user_dbrecords_page { 118 + struct list_head list; 119 + struct ib_umem *umem; 120 + u64 va; 121 + int refcnt; 122 + }; 123 + 124 + struct erdma_uqp { 125 + struct erdma_mem sq_mtt; 126 + struct erdma_mem rq_mtt; 127 + 128 + dma_addr_t sq_db_info_dma_addr; 129 + dma_addr_t rq_db_info_dma_addr; 130 + 131 + struct erdma_user_dbrecords_page *user_dbr_page; 132 + 133 + u32 rq_offset; 134 + }; 135 + 136 + struct erdma_kqp { 137 + u16 sq_pi; 138 + u16 sq_ci; 139 + 140 + u16 rq_pi; 141 + u16 rq_ci; 142 + 143 + u64 *swr_tbl; 144 + u64 *rwr_tbl; 145 + 146 + void __iomem *hw_sq_db; 147 + void __iomem *hw_rq_db; 148 + 149 + void *sq_buf; 150 + dma_addr_t sq_buf_dma_addr; 151 + 152 + void *rq_buf; 153 + dma_addr_t rq_buf_dma_addr; 154 + 155 + void *sq_db_info; 156 + void *rq_db_info; 157 + 158 + u8 sig_all; 159 + }; 160 + 161 + enum erdma_qp_state { 162 + ERDMA_QP_STATE_IDLE = 0, 163 + ERDMA_QP_STATE_RTR = 1, 164 + ERDMA_QP_STATE_RTS = 2, 165 + ERDMA_QP_STATE_CLOSING = 3, 166 + ERDMA_QP_STATE_TERMINATE = 4, 167 + ERDMA_QP_STATE_ERROR = 5, 168 + ERDMA_QP_STATE_UNDEF = 7, 169 + ERDMA_QP_STATE_COUNT = 8 170 + }; 171 + 172 + enum erdma_qp_attr_mask { 173 + ERDMA_QP_ATTR_STATE = (1 << 0), 174 + ERDMA_QP_ATTR_LLP_HANDLE = (1 << 2), 175 + ERDMA_QP_ATTR_ORD = (1 << 3), 176 + ERDMA_QP_ATTR_IRD = (1 << 4), 177 + ERDMA_QP_ATTR_SQ_SIZE = (1 << 5), 178 + ERDMA_QP_ATTR_RQ_SIZE = (1 << 6), 179 + ERDMA_QP_ATTR_MPA = (1 << 7) 180 + }; 181 + 182 + struct erdma_qp_attrs { 183 + enum erdma_qp_state state; 184 + enum erdma_cc_alg cc; /* Congestion control algorithm */ 185 + u32 sq_size; 186 + u32 rq_size; 187 + u32 orq_size; 188 + u32 irq_size; 189 + u32 max_send_sge; 190 + u32 max_recv_sge; 191 + u32 cookie; 192 + #define ERDMA_QP_ACTIVE 0 193 + #define ERDMA_QP_PASSIVE 1 194 + u8 qp_type; 195 + u8 pd_len; 196 + }; 197 + 198 + struct erdma_qp { 199 + struct ib_qp ibqp; 200 + struct kref ref; 201 + struct completion safe_free; 202 + struct erdma_dev *dev; 203 + struct erdma_cep *cep; 204 + struct rw_semaphore state_lock; 205 + 206 + union { 207 + struct erdma_kqp kern_qp; 208 + struct erdma_uqp user_qp; 209 + }; 210 + 211 + struct erdma_cq *scq; 212 + struct erdma_cq *rcq; 213 + 214 + struct erdma_qp_attrs attrs; 215 + spinlock_t lock; 216 + }; 217 + 218 + struct erdma_kcq_info { 219 + void *qbuf; 220 + dma_addr_t qbuf_dma_addr; 221 + u32 ci; 222 + u32 cmdsn; 223 + u32 notify_cnt; 224 + 225 + spinlock_t lock; 226 + u8 __iomem *db; 227 + u64 *db_record; 228 + }; 229 + 230 + struct erdma_ucq_info { 231 + struct erdma_mem qbuf_mtt; 232 + struct erdma_user_dbrecords_page *user_dbr_page; 233 + dma_addr_t db_info_dma_addr; 234 + }; 235 + 236 + struct erdma_cq { 237 + struct ib_cq ibcq; 238 + u32 cqn; 239 + 240 + u32 depth; 241 + u32 assoc_eqn; 242 + 243 + union { 244 + struct erdma_kcq_info kern_cq; 245 + struct erdma_ucq_info user_cq; 246 + }; 247 + }; 248 + 249 + #define QP_ID(qp) ((qp)->ibqp.qp_num) 250 + 251 + static inline struct erdma_qp *find_qp_by_qpn(struct erdma_dev *dev, int id) 252 + { 253 + return (struct erdma_qp *)xa_load(&dev->qp_xa, id); 254 + } 
255 + 256 + static inline struct erdma_cq *find_cq_by_cqn(struct erdma_dev *dev, int id) 257 + { 258 + return (struct erdma_cq *)xa_load(&dev->cq_xa, id); 259 + } 260 + 261 + void erdma_qp_get(struct erdma_qp *qp); 262 + void erdma_qp_put(struct erdma_qp *qp); 263 + int erdma_modify_qp_internal(struct erdma_qp *qp, struct erdma_qp_attrs *attrs, 264 + enum erdma_qp_attr_mask mask); 265 + void erdma_qp_llp_close(struct erdma_qp *qp); 266 + void erdma_qp_cm_drop(struct erdma_qp *qp); 267 + 268 + static inline struct erdma_ucontext *to_ectx(struct ib_ucontext *ibctx) 269 + { 270 + return container_of(ibctx, struct erdma_ucontext, ibucontext); 271 + } 272 + 273 + static inline struct erdma_pd *to_epd(struct ib_pd *pd) 274 + { 275 + return container_of(pd, struct erdma_pd, ibpd); 276 + } 277 + 278 + static inline struct erdma_mr *to_emr(struct ib_mr *ibmr) 279 + { 280 + return container_of(ibmr, struct erdma_mr, ibmr); 281 + } 282 + 283 + static inline struct erdma_qp *to_eqp(struct ib_qp *qp) 284 + { 285 + return container_of(qp, struct erdma_qp, ibqp); 286 + } 287 + 288 + static inline struct erdma_cq *to_ecq(struct ib_cq *ibcq) 289 + { 290 + return container_of(ibcq, struct erdma_cq, ibcq); 291 + } 292 + 293 + static inline struct erdma_user_mmap_entry * 294 + to_emmap(struct rdma_user_mmap_entry *ibmmap) 295 + { 296 + return container_of(ibmmap, struct erdma_user_mmap_entry, rdma_entry); 297 + } 298 + 299 + int erdma_alloc_ucontext(struct ib_ucontext *ibctx, struct ib_udata *data); 300 + void erdma_dealloc_ucontext(struct ib_ucontext *ibctx); 301 + int erdma_query_device(struct ib_device *dev, struct ib_device_attr *attr, 302 + struct ib_udata *data); 303 + int erdma_get_port_immutable(struct ib_device *dev, u32 port, 304 + struct ib_port_immutable *ib_port_immutable); 305 + int erdma_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, 306 + struct ib_udata *data); 307 + int erdma_query_port(struct ib_device *dev, u32 port, 308 + struct ib_port_attr *attr); 309 + int erdma_query_gid(struct ib_device *dev, u32 port, int idx, 310 + union ib_gid *gid); 311 + int erdma_alloc_pd(struct ib_pd *ibpd, struct ib_udata *data); 312 + int erdma_dealloc_pd(struct ib_pd *ibpd, struct ib_udata *udata); 313 + int erdma_create_qp(struct ib_qp *ibqp, struct ib_qp_init_attr *attr, 314 + struct ib_udata *data); 315 + int erdma_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int mask, 316 + struct ib_qp_init_attr *init_attr); 317 + int erdma_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int mask, 318 + struct ib_udata *data); 319 + int erdma_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata); 320 + int erdma_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata); 321 + int erdma_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags); 322 + struct ib_mr *erdma_reg_user_mr(struct ib_pd *ibpd, u64 start, u64 len, 323 + u64 virt, int access, struct ib_udata *udata); 324 + struct ib_mr *erdma_get_dma_mr(struct ib_pd *ibpd, int rights); 325 + int erdma_dereg_mr(struct ib_mr *ibmr, struct ib_udata *data); 326 + int erdma_mmap(struct ib_ucontext *ctx, struct vm_area_struct *vma); 327 + void erdma_mmap_free(struct rdma_user_mmap_entry *rdma_entry); 328 + void erdma_qp_get_ref(struct ib_qp *ibqp); 329 + void erdma_qp_put_ref(struct ib_qp *ibqp); 330 + struct ib_qp *erdma_get_ibqp(struct ib_device *dev, int id); 331 + int erdma_post_send(struct ib_qp *ibqp, const struct ib_send_wr *send_wr, 332 + const struct ib_send_wr **bad_send_wr); 333 + int erdma_post_recv(struct ib_qp *ibqp, 
const struct ib_recv_wr *recv_wr, 334 + const struct ib_recv_wr **bad_recv_wr); 335 + int erdma_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc); 336 + struct ib_mr *erdma_ib_alloc_mr(struct ib_pd *ibpd, enum ib_mr_type mr_type, 337 + u32 max_num_sg); 338 + int erdma_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents, 339 + unsigned int *sg_offset); 340 + void erdma_port_event(struct erdma_dev *dev, enum ib_event_type reason); 341 + 342 + #endif
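The to_ectx()/to_eqp()/to_ecq()/to_emr() helpers in this header all depend on the same layout convention: the core allocates the driver object with the ib_* structure embedded in it, passes only the embedded pointer into the verbs callbacks, and the driver walks back to its own object with container_of(). A reduced sketch of the pattern (the sketch_* names are illustrative, not part of the driver):

#include <linux/kernel.h>
#include <rdma/ib_verbs.h>

/* Driver CQ wraps the core's ib_cq; the core only ever hands back &cq->ibcq. */
struct sketch_cq {
	struct ib_cq ibcq;
	u32 cqn;
};

static inline struct sketch_cq *to_sketch_cq(struct ib_cq *ibcq)
{
	/* Walk back from the embedded member to the containing structure. */
	return container_of(ibcq, struct sketch_cq, ibcq);
}

/* A verbs entry point receives the ib_cq and converts on entry. */
static int sketch_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata)
{
	struct sketch_cq *cq = to_sketch_cq(ibcq);

	pr_debug("destroying cqn %u\n", cq->cqn);
	return 0;
}

The size of the containing structure is typically declared to the core through the INIT_RDMA_OBJ_SIZE() entries in the driver's ib_device_ops, so the core can allocate the full wrapper even though it only sees the embedded type.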
+1 -1
drivers/infiniband/hw/hfi1/Kconfig
··· 1 1 # SPDX-License-Identifier: GPL-2.0-only 2 2 config INFINIBAND_HFI1 3 3 tristate "Cornelis OPX Gen1 support" 4 - depends on X86_64 && INFINIBAND_RDMAVT && I2C 4 + depends on X86_64 && INFINIBAND_RDMAVT && I2C && !UML 5 5 select MMU_NOTIFIER 6 6 select CRC32 7 7 select I2C_ALGOBIT
+3 -1
drivers/infiniband/hw/hfi1/file_ops.c
··· 1179 1179 goto done; 1180 1180 1181 1181 ret = init_user_ctxt(fd, uctxt); 1182 - if (ret) 1182 + if (ret) { 1183 + hfi1_free_ctxt_rcv_groups(uctxt); 1183 1184 goto done; 1185 + } 1184 1186 1185 1187 user_init(uctxt); 1186 1188
+1 -3
drivers/infiniband/hw/hfi1/ipoib_tx.c
··· 742 742 kzalloc_node(sizeof(*tx->sdma_hdr), 743 743 GFP_KERNEL, priv->dd->node); 744 744 745 - netif_tx_napi_add(dev, &txq->napi, 746 - hfi1_ipoib_poll_tx_ring, 747 - NAPI_POLL_WEIGHT); 745 + netif_napi_add_tx(dev, &txq->napi, hfi1_ipoib_poll_tx_ring); 748 746 } 749 747 750 748 return 0;
+1 -1
drivers/infiniband/hw/hfi1/netdev_rx.c
··· 216 216 * right now. 217 217 */ 218 218 set_bit(NAPI_STATE_NO_BUSY_POLL, &rxq->napi.state); 219 - netif_napi_add(dev, &rxq->napi, hfi1_netdev_rx_napi, 64); 219 + netif_napi_add_weight(dev, &rxq->napi, hfi1_netdev_rx_napi, 64); 220 220 rc = msix_netdev_request_rcd_irq(rxq->rcd); 221 221 if (rc) 222 222 goto bail_context_irq_failure;
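Both hfi1 hunks above belong to the "use the modern NAPI API" series: netif_napi_add_tx() replaces the old netif_tx_napi_add() for TX completion polling, and callers that genuinely want a non-default poll budget now say so explicitly with netif_napi_add_weight(). A short sketch of the two registrations (device and queue names are placeholders):

#include <linux/netdevice.h>

struct sketch_rxq {
	struct napi_struct napi;
};

struct sketch_txq {
	struct napi_struct napi;
};

static int sketch_rx_poll(struct napi_struct *napi, int budget)
{
	/* Process up to 'budget' RX completions; return how many were done. */
	return 0;
}

static int sketch_tx_poll(struct napi_struct *napi, int budget)
{
	/* Reap TX completions; same return convention as any NAPI poll. */
	return 0;
}

static void sketch_register_napi(struct net_device *dev,
				 struct sketch_rxq *rxq,
				 struct sketch_txq *txq)
{
	/* RX keeps an explicit, non-default budget of 64, as netdev_rx.c does. */
	netif_napi_add_weight(dev, &rxq->napi, sketch_rx_poll, 64);

	/* TX completion polling uses the dedicated TX helper. */
	netif_napi_add_tx(dev, &txq->napi, sketch_tx_poll);
}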
+1 -1
drivers/infiniband/hw/hfi1/pio_copy.c
··· 172 172 } 173 173 174 174 /* 175 - * Read nbytes from "from" and and place them in the low bytes 175 + * Read nbytes from "from" and place them in the low bytes 176 176 * of pbuf->carry. Other bytes are left as-is. Any previous 177 177 * value in pbuf->carry is lost. 178 178 *
+1
drivers/infiniband/hw/hns/hns_roce_device.h
··· 959 959 const struct hns_roce_hw *hw; 960 960 void *priv; 961 961 struct workqueue_struct *irq_workq; 962 + struct work_struct ecc_work; 962 963 const struct hns_roce_dfx_hw *dfx; 963 964 u32 func_num; 964 965 u32 is_vf;
+217 -37
drivers/infiniband/hw/hns/hns_roce_hw_v2.c
··· 55 55 CMD_RST_PRC_EBUSY, 56 56 }; 57 57 58 + enum ecc_resource_type { 59 + ECC_RESOURCE_QPC, 60 + ECC_RESOURCE_CQC, 61 + ECC_RESOURCE_MPT, 62 + ECC_RESOURCE_SRQC, 63 + ECC_RESOURCE_GMV, 64 + ECC_RESOURCE_QPC_TIMER, 65 + ECC_RESOURCE_CQC_TIMER, 66 + ECC_RESOURCE_SCCC, 67 + ECC_RESOURCE_COUNT, 68 + }; 69 + 70 + static const struct { 71 + const char *name; 72 + u8 read_bt0_op; 73 + u8 write_bt0_op; 74 + } fmea_ram_res[] = { 75 + { "ECC_RESOURCE_QPC", 76 + HNS_ROCE_CMD_READ_QPC_BT0, HNS_ROCE_CMD_WRITE_QPC_BT0 }, 77 + { "ECC_RESOURCE_CQC", 78 + HNS_ROCE_CMD_READ_CQC_BT0, HNS_ROCE_CMD_WRITE_CQC_BT0 }, 79 + { "ECC_RESOURCE_MPT", 80 + HNS_ROCE_CMD_READ_MPT_BT0, HNS_ROCE_CMD_WRITE_MPT_BT0 }, 81 + { "ECC_RESOURCE_SRQC", 82 + HNS_ROCE_CMD_READ_SRQC_BT0, HNS_ROCE_CMD_WRITE_SRQC_BT0 }, 83 + /* ECC_RESOURCE_GMV is handled by cmdq, not mailbox */ 84 + { "ECC_RESOURCE_GMV", 85 + 0, 0 }, 86 + { "ECC_RESOURCE_QPC_TIMER", 87 + HNS_ROCE_CMD_READ_QPC_TIMER_BT0, HNS_ROCE_CMD_WRITE_QPC_TIMER_BT0 }, 88 + { "ECC_RESOURCE_CQC_TIMER", 89 + HNS_ROCE_CMD_READ_CQC_TIMER_BT0, HNS_ROCE_CMD_WRITE_CQC_TIMER_BT0 }, 90 + { "ECC_RESOURCE_SCCC", 91 + HNS_ROCE_CMD_READ_SCCC_BT0, HNS_ROCE_CMD_WRITE_SCCC_BT0 }, 92 + }; 93 + 58 94 static inline void set_data_seg_v2(struct hns_roce_v2_wqe_data_seg *dseg, 59 95 struct ib_sge *sg) 60 96 { ··· 5891 5855 !!(eq->cons_index & eq->entries)) ? aeqe : NULL; 5892 5856 } 5893 5857 5894 - static int hns_roce_v2_aeq_int(struct hns_roce_dev *hr_dev, 5895 - struct hns_roce_eq *eq) 5858 + static irqreturn_t hns_roce_v2_aeq_int(struct hns_roce_dev *hr_dev, 5859 + struct hns_roce_eq *eq) 5896 5860 { 5897 5861 struct device *dev = hr_dev->dev; 5898 5862 struct hns_roce_aeqe *aeqe = next_aeqe_sw_v2(eq); 5899 - int aeqe_found = 0; 5863 + irqreturn_t aeqe_found = IRQ_NONE; 5900 5864 int event_type; 5901 5865 u32 queue_num; 5902 5866 int sub_type; ··· 5950 5914 eq->event_type = event_type; 5951 5915 eq->sub_type = sub_type; 5952 5916 ++eq->cons_index; 5953 - aeqe_found = 1; 5917 + aeqe_found = IRQ_HANDLED; 5954 5918 5955 5919 hns_roce_v2_init_irq_work(hr_dev, eq, queue_num); 5956 5920 ··· 5958 5922 } 5959 5923 5960 5924 update_eq_db(eq); 5961 - return aeqe_found; 5925 + 5926 + return IRQ_RETVAL(aeqe_found); 5962 5927 } 5963 5928 5964 5929 static struct hns_roce_ceqe *next_ceqe_sw_v2(struct hns_roce_eq *eq) ··· 5974 5937 !!(eq->cons_index & eq->entries)) ? 
ceqe : NULL; 5975 5938 } 5976 5939 5977 - static int hns_roce_v2_ceq_int(struct hns_roce_dev *hr_dev, 5978 - struct hns_roce_eq *eq) 5940 + static irqreturn_t hns_roce_v2_ceq_int(struct hns_roce_dev *hr_dev, 5941 + struct hns_roce_eq *eq) 5979 5942 { 5980 5943 struct hns_roce_ceqe *ceqe = next_ceqe_sw_v2(eq); 5981 - int ceqe_found = 0; 5944 + irqreturn_t ceqe_found = IRQ_NONE; 5982 5945 u32 cqn; 5983 5946 5984 5947 while (ceqe) { ··· 5992 5955 hns_roce_cq_completion(hr_dev, cqn); 5993 5956 5994 5957 ++eq->cons_index; 5995 - ceqe_found = 1; 5958 + ceqe_found = IRQ_HANDLED; 5996 5959 5997 5960 ceqe = next_ceqe_sw_v2(eq); 5998 5961 } 5999 5962 6000 5963 update_eq_db(eq); 6001 5964 6002 - return ceqe_found; 5965 + return IRQ_RETVAL(ceqe_found); 6003 5966 } 6004 5967 6005 5968 static irqreturn_t hns_roce_v2_msix_interrupt_eq(int irq, void *eq_ptr) 6006 5969 { 6007 5970 struct hns_roce_eq *eq = eq_ptr; 6008 5971 struct hns_roce_dev *hr_dev = eq->hr_dev; 6009 - int int_work; 5972 + irqreturn_t int_work; 6010 5973 6011 5974 if (eq->type_flag == HNS_ROCE_CEQ) 6012 5975 /* Completion event interrupt */ ··· 6018 5981 return IRQ_RETVAL(int_work); 6019 5982 } 6020 5983 6021 - static irqreturn_t hns_roce_v2_msix_interrupt_abn(int irq, void *dev_id) 5984 + static irqreturn_t abnormal_interrupt_basic(struct hns_roce_dev *hr_dev, 5985 + u32 int_st) 6022 5986 { 6023 - struct hns_roce_dev *hr_dev = dev_id; 6024 - struct device *dev = hr_dev->dev; 6025 - int int_work = 0; 6026 - u32 int_st; 5987 + struct pci_dev *pdev = hr_dev->pci_dev; 5988 + struct hnae3_ae_dev *ae_dev = pci_get_drvdata(pdev); 5989 + const struct hnae3_ae_ops *ops = ae_dev->ops; 5990 + irqreturn_t int_work = IRQ_NONE; 6027 5991 u32 int_en; 6028 5992 6029 - /* Abnormal interrupt */ 6030 - int_st = roce_read(hr_dev, ROCEE_VF_ABN_INT_ST_REG); 6031 5993 int_en = roce_read(hr_dev, ROCEE_VF_ABN_INT_EN_REG); 6032 5994 6033 5995 if (int_st & BIT(HNS_ROCE_V2_VF_INT_ST_AEQ_OVERFLOW_S)) { 6034 - struct pci_dev *pdev = hr_dev->pci_dev; 6035 - struct hnae3_ae_dev *ae_dev = pci_get_drvdata(pdev); 6036 - const struct hnae3_ae_ops *ops = ae_dev->ops; 5996 + dev_err(hr_dev->dev, "AEQ overflow!\n"); 6037 5997 6038 - dev_err(dev, "AEQ overflow!\n"); 6039 - 6040 - int_st |= 1 << HNS_ROCE_V2_VF_INT_ST_AEQ_OVERFLOW_S; 6041 - roce_write(hr_dev, ROCEE_VF_ABN_INT_ST_REG, int_st); 5998 + roce_write(hr_dev, ROCEE_VF_ABN_INT_ST_REG, 5999 + 1 << HNS_ROCE_V2_VF_INT_ST_AEQ_OVERFLOW_S); 6042 6000 6043 6001 /* Set reset level for reset_event() */ 6044 6002 if (ops->set_default_reset_request) ··· 6045 6013 int_en |= 1 << HNS_ROCE_V2_VF_ABN_INT_EN_S; 6046 6014 roce_write(hr_dev, ROCEE_VF_ABN_INT_EN_REG, int_en); 6047 6015 6048 - int_work = 1; 6049 - } else if (int_st & BIT(HNS_ROCE_V2_VF_INT_ST_RAS_INT_S)) { 6050 - dev_err(dev, "RAS interrupt!\n"); 6051 - 6052 - int_st |= 1 << HNS_ROCE_V2_VF_INT_ST_RAS_INT_S; 6053 - roce_write(hr_dev, ROCEE_VF_ABN_INT_ST_REG, int_st); 6054 - 6055 - int_en |= 1 << HNS_ROCE_V2_VF_ABN_INT_EN_S; 6056 - roce_write(hr_dev, ROCEE_VF_ABN_INT_EN_REG, int_en); 6057 - 6058 - int_work = 1; 6016 + int_work = IRQ_HANDLED; 6059 6017 } else { 6060 - dev_err(dev, "There is no abnormal irq found!\n"); 6018 + dev_err(hr_dev->dev, "there is no basic abn irq found.\n"); 6019 + } 6020 + 6021 + return IRQ_RETVAL(int_work); 6022 + } 6023 + 6024 + static int fmea_ram_ecc_query(struct hns_roce_dev *hr_dev, 6025 + struct fmea_ram_ecc *ecc_info) 6026 + { 6027 + struct hns_roce_cmq_desc desc; 6028 + struct hns_roce_cmq_req *req = (struct hns_roce_cmq_req *)desc.data; 
6029 + int ret; 6030 + 6031 + hns_roce_cmq_setup_basic_desc(&desc, HNS_ROCE_QUERY_RAM_ECC, true); 6032 + ret = hns_roce_cmq_send(hr_dev, &desc, 1); 6033 + if (ret) 6034 + return ret; 6035 + 6036 + ecc_info->is_ecc_err = hr_reg_read(req, QUERY_RAM_ECC_1BIT_ERR); 6037 + ecc_info->res_type = hr_reg_read(req, QUERY_RAM_ECC_RES_TYPE); 6038 + ecc_info->index = hr_reg_read(req, QUERY_RAM_ECC_TAG); 6039 + 6040 + return 0; 6041 + } 6042 + 6043 + static int fmea_recover_gmv(struct hns_roce_dev *hr_dev, u32 idx) 6044 + { 6045 + struct hns_roce_cmq_desc desc; 6046 + struct hns_roce_cmq_req *req = (struct hns_roce_cmq_req *)desc.data; 6047 + u32 addr_upper; 6048 + u32 addr_low; 6049 + int ret; 6050 + 6051 + hns_roce_cmq_setup_basic_desc(&desc, HNS_ROCE_OPC_CFG_GMV_BT, true); 6052 + hr_reg_write(req, CFG_GMV_BT_IDX, idx); 6053 + 6054 + ret = hns_roce_cmq_send(hr_dev, &desc, 1); 6055 + if (ret) { 6056 + dev_err(hr_dev->dev, 6057 + "failed to execute cmd to read gmv, ret = %d.\n", ret); 6058 + return ret; 6059 + } 6060 + 6061 + addr_low = hr_reg_read(req, CFG_GMV_BT_BA_L); 6062 + addr_upper = hr_reg_read(req, CFG_GMV_BT_BA_H); 6063 + 6064 + hns_roce_cmq_setup_basic_desc(&desc, HNS_ROCE_OPC_CFG_GMV_BT, false); 6065 + hr_reg_write(req, CFG_GMV_BT_BA_L, addr_low); 6066 + hr_reg_write(req, CFG_GMV_BT_BA_H, addr_upper); 6067 + hr_reg_write(req, CFG_GMV_BT_IDX, idx); 6068 + 6069 + return hns_roce_cmq_send(hr_dev, &desc, 1); 6070 + } 6071 + 6072 + static u64 fmea_get_ram_res_addr(u32 res_type, __le64 *data) 6073 + { 6074 + if (res_type == ECC_RESOURCE_QPC_TIMER || 6075 + res_type == ECC_RESOURCE_CQC_TIMER || 6076 + res_type == ECC_RESOURCE_SCCC) 6077 + return le64_to_cpu(*data); 6078 + 6079 + return le64_to_cpu(*data) << PAGE_SHIFT; 6080 + } 6081 + 6082 + static int fmea_recover_others(struct hns_roce_dev *hr_dev, u32 res_type, 6083 + u32 index) 6084 + { 6085 + u8 write_bt0_op = fmea_ram_res[res_type].write_bt0_op; 6086 + u8 read_bt0_op = fmea_ram_res[res_type].read_bt0_op; 6087 + struct hns_roce_cmd_mailbox *mailbox; 6088 + u64 addr; 6089 + int ret; 6090 + 6091 + mailbox = hns_roce_alloc_cmd_mailbox(hr_dev); 6092 + if (IS_ERR(mailbox)) 6093 + return PTR_ERR(mailbox); 6094 + 6095 + ret = hns_roce_cmd_mbox(hr_dev, 0, mailbox->dma, read_bt0_op, index); 6096 + if (ret) { 6097 + dev_err(hr_dev->dev, 6098 + "failed to execute cmd to read fmea ram, ret = %d.\n", 6099 + ret); 6100 + goto out; 6101 + } 6102 + 6103 + addr = fmea_get_ram_res_addr(res_type, mailbox->buf); 6104 + 6105 + ret = hns_roce_cmd_mbox(hr_dev, addr, 0, write_bt0_op, index); 6106 + if (ret) 6107 + dev_err(hr_dev->dev, 6108 + "failed to execute cmd to write fmea ram, ret = %d.\n", 6109 + ret); 6110 + 6111 + out: 6112 + hns_roce_free_cmd_mailbox(hr_dev, mailbox); 6113 + return ret; 6114 + } 6115 + 6116 + static void fmea_ram_ecc_recover(struct hns_roce_dev *hr_dev, 6117 + struct fmea_ram_ecc *ecc_info) 6118 + { 6119 + u32 res_type = ecc_info->res_type; 6120 + u32 index = ecc_info->index; 6121 + int ret; 6122 + 6123 + BUILD_BUG_ON(ARRAY_SIZE(fmea_ram_res) != ECC_RESOURCE_COUNT); 6124 + 6125 + if (res_type >= ECC_RESOURCE_COUNT) { 6126 + dev_err(hr_dev->dev, "unsupported fmea ram ecc type %u.\n", 6127 + res_type); 6128 + return; 6129 + } 6130 + 6131 + if (res_type == ECC_RESOURCE_GMV) 6132 + ret = fmea_recover_gmv(hr_dev, index); 6133 + else 6134 + ret = fmea_recover_others(hr_dev, res_type, index); 6135 + if (ret) 6136 + dev_err(hr_dev->dev, 6137 + "failed to recover %s, index = %u, ret = %d.\n", 6138 + fmea_ram_res[res_type].name, index, ret); 6139 + } 
6140 + 6141 + static void fmea_ram_ecc_work(struct work_struct *ecc_work) 6142 + { 6143 + struct hns_roce_dev *hr_dev = 6144 + container_of(ecc_work, struct hns_roce_dev, ecc_work); 6145 + struct fmea_ram_ecc ecc_info = {}; 6146 + 6147 + if (fmea_ram_ecc_query(hr_dev, &ecc_info)) { 6148 + dev_err(hr_dev->dev, "failed to query fmea ram ecc.\n"); 6149 + return; 6150 + } 6151 + 6152 + if (!ecc_info.is_ecc_err) { 6153 + dev_err(hr_dev->dev, "there is no fmea ram ecc err found.\n"); 6154 + return; 6155 + } 6156 + 6157 + fmea_ram_ecc_recover(hr_dev, &ecc_info); 6158 + } 6159 + 6160 + static irqreturn_t hns_roce_v2_msix_interrupt_abn(int irq, void *dev_id) 6161 + { 6162 + struct hns_roce_dev *hr_dev = dev_id; 6163 + irqreturn_t int_work = IRQ_NONE; 6164 + u32 int_st; 6165 + 6166 + int_st = roce_read(hr_dev, ROCEE_VF_ABN_INT_ST_REG); 6167 + 6168 + if (int_st) { 6169 + int_work = abnormal_interrupt_basic(hr_dev, int_st); 6170 + } else if (hr_dev->pci_dev->revision >= PCI_REVISION_ID_HIP09) { 6171 + queue_work(hr_dev->irq_workq, &hr_dev->ecc_work); 6172 + int_work = IRQ_HANDLED; 6173 + } else { 6174 + dev_err(hr_dev->dev, "there is no abnormal irq found.\n"); 6061 6175 } 6062 6176 6063 6177 return IRQ_RETVAL(int_work); ··· 6519 6341 goto err_create_eq_fail; 6520 6342 } 6521 6343 } 6344 + 6345 + INIT_WORK(&hr_dev->ecc_work, fmea_ram_ecc_work); 6522 6346 6523 6347 hr_dev->irq_workq = alloc_ordered_workqueue("hns_roce_irq_workq", 0); 6524 6348 if (!hr_dev->irq_workq) {
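The ECC handling added above cannot issue command-queue or mailbox operations from the hard-IRQ handler, so the abnormal interrupt only schedules hr_dev->ecc_work and the actual query/recover sequence runs later on the ordered irq_workq. The general shape of that hand-off, reduced to a sketch (structure and function names here are illustrative):

#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/workqueue.h>

struct sketch_dev {
	struct workqueue_struct *wq;
	struct work_struct ecc_work;
};

static void sketch_ecc_work(struct work_struct *work)
{
	struct sketch_dev *dev = container_of(work, struct sketch_dev, ecc_work);

	/* Process context: safe to sleep, query the error and rewrite state. */
	(void)dev;
}

static irqreturn_t sketch_abn_irq(int irq, void *dev_id)
{
	struct sketch_dev *dev = dev_id;

	/* Hard-IRQ context: just kick the work and report the IRQ as handled. */
	queue_work(dev->wq, &dev->ecc_work);
	return IRQ_HANDLED;
}

static int sketch_init(struct sketch_dev *dev)
{
	INIT_WORK(&dev->ecc_work, sketch_ecc_work);
	dev->wq = alloc_ordered_workqueue("sketch_irq_wq", 0);
	return dev->wq ? 0 : -ENOMEM;
}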
+12 -1
drivers/infiniband/hw/hns/hns_roce_hw_v2.h
··· 250 250 HNS_ROCE_OPC_CFG_GMV_TBL = 0x850f, 251 251 HNS_ROCE_OPC_CFG_GMV_BT = 0x8510, 252 252 HNS_ROCE_OPC_EXT_CFG = 0x8512, 253 + HNS_ROCE_QUERY_RAM_ECC = 0x8513, 253 254 HNS_SWITCH_PARAMETER_CFG = 0x1033, 254 255 }; 255 256 ··· 1108 1107 #define CFG_GMV_BT_BA_H CMQ_REQ_FIELD_LOC(51, 32) 1109 1108 #define CFG_GMV_BT_IDX CMQ_REQ_FIELD_LOC(95, 64) 1110 1109 1110 + /* Fields of HNS_ROCE_QUERY_RAM_ECC */ 1111 + #define QUERY_RAM_ECC_1BIT_ERR CMQ_REQ_FIELD_LOC(31, 0) 1112 + #define QUERY_RAM_ECC_RES_TYPE CMQ_REQ_FIELD_LOC(63, 32) 1113 + #define QUERY_RAM_ECC_TAG CMQ_REQ_FIELD_LOC(95, 64) 1114 + 1111 1115 struct hns_roce_cfg_sgid_tb { 1112 1116 __le32 table_idx_rsv; 1113 1117 __le32 vf_sgid_l; ··· 1349 1343 struct list_head node; /* all dips are on a list */ 1350 1344 }; 1351 1345 1346 + struct fmea_ram_ecc { 1347 + u32 is_ecc_err; 1348 + u32 res_type; 1349 + u32 index; 1350 + }; 1351 + 1352 1352 /* only for RNR timeout issue of HIP08 */ 1353 1353 #define HNS_ROCE_CLOCK_ADJUST 1000 1354 1354 #define HNS_ROCE_MAX_CQ_PERIOD 65 ··· 1394 1382 #define HNS_ROCE_V2_ASYNC_EQE_NUM 0x1000 1395 1383 1396 1384 #define HNS_ROCE_V2_VF_INT_ST_AEQ_OVERFLOW_S 0 1397 - #define HNS_ROCE_V2_VF_INT_ST_RAS_INT_S 1 1398 1385 1399 1386 #define HNS_ROCE_EQ_DB_CMD_AEQ 0x0 1400 1387 #define HNS_ROCE_EQ_DB_CMD_AEQ_ARMED 0x1
+6 -5
drivers/infiniband/hw/irdma/cm.c
··· 1477 1477 list_for_each_entry (listen_node, &cm_core->listen_list, list) { 1478 1478 memcpy(listen_addr, listen_node->loc_addr, sizeof(listen_addr)); 1479 1479 listen_port = listen_node->loc_port; 1480 + if (listen_port != dst_port || 1481 + !(listener_state & listen_node->listener_state)) 1482 + continue; 1480 1483 /* compare node pair, return node handle if a match */ 1481 - if ((!memcmp(listen_addr, dst_addr, sizeof(listen_addr)) || 1482 - !memcmp(listen_addr, ip_zero, sizeof(listen_addr))) && 1483 - listen_port == dst_port && 1484 - vlan_id == listen_node->vlan_id && 1485 - (listener_state & listen_node->listener_state)) { 1484 + if (!memcmp(listen_addr, ip_zero, sizeof(listen_addr)) || 1485 + (!memcmp(listen_addr, dst_addr, sizeof(listen_addr)) && 1486 + vlan_id == listen_node->vlan_id)) { 1486 1487 refcount_inc(&listen_node->refcnt); 1487 1488 spin_unlock_irqrestore(&cm_core->listen_list_lock, 1488 1489 flags);
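The reworked listener loop above first rejects on the cheap exact criteria (destination port and listener state) and only then accepts either a wildcard listener or an exact address match that also agrees on the VLAN. Expressed as a standalone predicate, as a sketch only (the helper name and the fixed 16-byte address size are assumptions, not irdma code):

#include <linux/string.h>
#include <linux/types.h>

#define SKETCH_ADDR_BYTES 16	/* IPv6-sized; IPv4 is carried mapped into it */

struct sketch_listener {
	u8  loc_addr[SKETCH_ADDR_BYTES];
	u16 loc_port;
	u16 vlan_id;
	int state;
};

static bool listener_matches(const struct sketch_listener *l,
			     const u8 *dst_addr, u16 dst_port,
			     u16 vlan_id, int wanted_state)
{
	static const u8 ip_zero[SKETCH_ADDR_BYTES];

	/* Port and listener state must match before addresses are compared. */
	if (l->loc_port != dst_port || !(wanted_state & l->state))
		return false;

	/* An all-zero local address is a wildcard listener: matches anything. */
	if (!memcmp(l->loc_addr, ip_zero, sizeof(ip_zero)))
		return true;

	/* Otherwise require an exact address match on the same VLAN. */
	return !memcmp(l->loc_addr, dst_addr, sizeof(l->loc_addr)) &&
	       l->vlan_id == vlan_id;
}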
+5 -3
drivers/infiniband/hw/irdma/ctrl.c
··· 4872 4872 4873 4873 sd_diff = sd_needed - hmc_fpm_misc->max_sds; 4874 4874 if (sd_diff > 128) { 4875 - if (qpwanted > 128 && sd_diff > 144) 4875 + if (!(loop_count % 2) && qpwanted > 128) { 4876 4876 qpwanted /= 2; 4877 - mrwanted /= 2; 4878 - pblewanted /= 2; 4877 + } else { 4878 + mrwanted /= 2; 4879 + pblewanted /= 2; 4880 + } 4879 4881 continue; 4880 4882 } 4881 4883 if (dev->cqp->hmc_profile != IRDMA_HMC_PROFILE_FAVOR_VF &&
+11 -22
drivers/infiniband/hw/irdma/hw.c
··· 257 257 iwqp->last_aeq = info->ae_id; 258 258 spin_unlock_irqrestore(&iwqp->lock, flags); 259 259 ctx_info = &iwqp->ctx_info; 260 - if (rdma_protocol_roce(&iwqp->iwdev->ibdev, 1)) 261 - ctx_info->roce_info->err_rq_idx_valid = true; 262 - else 263 - ctx_info->iwarp_info->err_rq_idx_valid = true; 264 260 } else { 265 261 if (info->ae_id != IRDMA_AE_CQ_OPERATION_ERROR) 266 262 continue; ··· 366 370 case IRDMA_AE_LCE_FUNCTION_CATASTROPHIC: 367 371 case IRDMA_AE_LCE_CQ_CATASTROPHIC: 368 372 case IRDMA_AE_UDA_XMIT_DGRAM_TOO_LONG: 369 - if (rdma_protocol_roce(&iwdev->ibdev, 1)) 370 - ctx_info->roce_info->err_rq_idx_valid = false; 371 - else 372 - ctx_info->iwarp_info->err_rq_idx_valid = false; 373 - fallthrough; 374 373 default: 375 - ibdev_err(&iwdev->ibdev, "abnormal ae_id = 0x%x bool qp=%d qp_id = %d\n", 376 - info->ae_id, info->qp, info->qp_cq_id); 374 + ibdev_err(&iwdev->ibdev, "abnormal ae_id = 0x%x bool qp=%d qp_id = %d, ae_src=%d\n", 375 + info->ae_id, info->qp, info->qp_cq_id, info->ae_src); 377 376 if (rdma_protocol_roce(&iwdev->ibdev, 1)) { 378 - if (!info->sq && ctx_info->roce_info->err_rq_idx_valid) { 377 + ctx_info->roce_info->err_rq_idx_valid = info->rq; 378 + if (info->rq) { 379 379 ctx_info->roce_info->err_rq_idx = info->wqe_idx; 380 380 irdma_sc_qp_setctx_roce(&iwqp->sc_qp, iwqp->host_ctx.va, 381 381 ctx_info); ··· 380 388 irdma_cm_disconn(iwqp); 381 389 break; 382 390 } 383 - if (!info->sq && ctx_info->iwarp_info->err_rq_idx_valid) { 391 + ctx_info->iwarp_info->err_rq_idx_valid = info->rq; 392 + if (info->rq) { 384 393 ctx_info->iwarp_info->err_rq_idx = info->wqe_idx; 385 394 ctx_info->tcp_info_valid = false; 386 395 ctx_info->iwarp_info_valid = true; ··· 1505 1512 int status; 1506 1513 u32 qpcnt; 1507 1514 1508 - if (rf->rdma_ver == IRDMA_GEN_1) 1509 - qpcnt = rsrc_limits_table[rf->limits_sel].qplimit * 2; 1510 - else 1511 - qpcnt = rsrc_limits_table[rf->limits_sel].qplimit; 1515 + qpcnt = rsrc_limits_table[rf->limits_sel].qplimit; 1512 1516 1513 1517 rf->sd_type = IRDMA_SD_TYPE_DIRECT; 1514 1518 status = irdma_cfg_fpm_val(&rf->sc_dev, qpcnt); ··· 1533 1543 rf->obj_mem.pa); 1534 1544 rf->obj_mem.va = NULL; 1535 1545 if (rf->rdma_ver != IRDMA_GEN_1) { 1536 - kfree(rf->allocated_ws_nodes); 1546 + bitmap_free(rf->allocated_ws_nodes); 1537 1547 rf->allocated_ws_nodes = NULL; 1538 1548 } 1539 1549 kfree(rf->ceqlist); ··· 1962 1972 u32 ret; 1963 1973 1964 1974 if (rf->rdma_ver != IRDMA_GEN_1) { 1965 - rf->allocated_ws_nodes = 1966 - kcalloc(BITS_TO_LONGS(IRDMA_MAX_WS_NODES), 1967 - sizeof(unsigned long), GFP_KERNEL); 1975 + rf->allocated_ws_nodes = bitmap_zalloc(IRDMA_MAX_WS_NODES, 1976 + GFP_KERNEL); 1968 1977 if (!rf->allocated_ws_nodes) 1969 1978 return -ENOMEM; 1970 1979 ··· 2012 2023 return 0; 2013 2024 2014 2025 mem_rsrc_kzalloc_fail: 2015 - kfree(rf->allocated_ws_nodes); 2026 + bitmap_free(rf->allocated_ws_nodes); 2016 2027 rf->allocated_ws_nodes = NULL; 2017 2028 2018 2029 return ret;
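The ws-node tracking change above is one of the "use the bitmap API instead of open coding" conversions: the hand-sized kcalloc(BITS_TO_LONGS(...)) / kfree() pair becomes bitmap_zalloc()/bitmap_free(), which states the intended bit count directly. A minimal before/after sketch (names are placeholders):

#include <linux/bitmap.h>
#include <linux/slab.h>

#define SKETCH_MAX_NODES 64

/* Old style: size the allocation by hand in units of unsigned long. */
static unsigned long *sketch_alloc_open_coded(void)
{
	return kcalloc(BITS_TO_LONGS(SKETCH_MAX_NODES),
		       sizeof(unsigned long), GFP_KERNEL);
}

/* New style: ask for a bitmap of SKETCH_MAX_NODES bits directly. */
static unsigned long *sketch_alloc_bitmap(void)
{
	return bitmap_zalloc(SKETCH_MAX_NODES, GFP_KERNEL);
}

static void sketch_free_bitmap(unsigned long *map)
{
	bitmap_free(map);	/* pairs with bitmap_zalloc() */
}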
+1 -1
drivers/infiniband/hw/irdma/main.h
··· 85 85 #define IRDMA_NO_QSET 0xffff 86 86 87 87 #define IW_CFG_FPM_QP_COUNT 32768 88 - #define IRDMA_MAX_PAGES_PER_FMR 512 88 + #define IRDMA_MAX_PAGES_PER_FMR 262144 89 89 #define IRDMA_MIN_PAGES_PER_FMR 1 90 90 #define IRDMA_CQP_COMPL_RQ_WQE_FLUSHED 2 91 91 #define IRDMA_CQP_COMPL_SQ_WQE_FLUSHED 3
+1
drivers/infiniband/hw/irdma/utils.c
··· 652 652 }; 653 653 654 654 static const struct irdma_cqp_err_info irdma_noncrit_err_list[] = { 655 + {0xffff, 0x8002, "Invalid State"}, 655 656 {0xffff, 0x8006, "Flush No Wqe Pending"}, 656 657 {0xffff, 0x8007, "Modify QP Bad Close"}, 657 658 {0xffff, 0x8009, "LLP Closed"},
+12 -4
drivers/infiniband/hw/irdma/verbs.c
··· 1776 1776 spin_unlock_irqrestore(&iwcq->lock, flags); 1777 1777 1778 1778 irdma_cq_wq_destroy(iwdev->rf, cq); 1779 - irdma_cq_free_rsrc(iwdev->rf, iwcq); 1780 1779 1781 1780 spin_lock_irqsave(&iwceq->ce_lock, flags); 1782 1781 irdma_sc_cleanup_ceqes(cq, ceq); 1783 1782 spin_unlock_irqrestore(&iwceq->ce_lock, flags); 1783 + irdma_cq_free_rsrc(iwdev->rf, iwcq); 1784 1784 1785 1785 return 0; 1786 1786 } ··· 2605 2605 palloc = &iwpbl->pble_alloc; 2606 2606 iwmr->page_cnt = max_num_sg; 2607 2607 err_code = irdma_get_pble(iwdev->rf->pble_rsrc, palloc, iwmr->page_cnt, 2608 - true); 2608 + false); 2609 2609 if (err_code) 2610 2610 goto err_get_pble; 2611 2611 ··· 2641 2641 if (unlikely(iwmr->npages == iwmr->page_cnt)) 2642 2642 return -ENOMEM; 2643 2643 2644 - pbl = palloc->level1.addr; 2645 - pbl[iwmr->npages++] = addr; 2644 + if (palloc->level == PBLE_LEVEL_2) { 2645 + struct irdma_pble_info *palloc_info = 2646 + palloc->level2.leaf + (iwmr->npages >> PBLE_512_SHIFT); 2647 + 2648 + palloc_info->addr[iwmr->npages & (PBLE_PER_PAGE - 1)] = addr; 2649 + } else { 2650 + pbl = palloc->level1.addr; 2651 + pbl[iwmr->npages] = addr; 2652 + } 2653 + iwmr->npages++; 2646 2654 2647 2655 return 0; 2648 2656 }
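irdma_set_page() above now handles a two-level PBLE allocation: the running page count selects a leaf via a right shift and the low bits index within that leaf. Assuming 512 entries per leaf page, which is what the PBLE_512_SHIFT / PBLE_PER_PAGE names suggest but which is not shown in this hunk, the index split looks like this sketch:

#include <linux/types.h>

#define SKETCH_PBLE_SHIFT	9	/* assumed: log2(entries per leaf page) */
#define SKETCH_PBLE_PER_PAGE	(1U << SKETCH_PBLE_SHIFT)

struct sketch_leaf {
	u64 *addr;	/* SKETCH_PBLE_PER_PAGE physical addresses per leaf */
};

/* Store 'addr' at the npages-th slot of a two-level PBLE table. */
static void sketch_set_page(struct sketch_leaf *leaves, u32 npages, u64 addr)
{
	struct sketch_leaf *leaf = &leaves[npages >> SKETCH_PBLE_SHIFT];
	u32 slot = npages & (SKETCH_PBLE_PER_PAGE - 1);

	leaf->addr[slot] = addr;
}

The single-level case keeps the original behaviour: one flat array indexed directly by npages, as the else branch in the hunk shows.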
+4
drivers/infiniband/hw/mlx5/cq.c
··· 523 523 "Requestor" : "Responder", cq->mcq.cqn); 524 524 mlx5_ib_dbg(dev, "syndrome 0x%x, vendor syndrome 0x%x\n", 525 525 err_cqe->syndrome, err_cqe->vendor_err_synd); 526 + if (wc->status != IB_WC_WR_FLUSH_ERR && 527 + (*cur_qp)->type == MLX5_IB_QPT_REG_UMR) 528 + dev->umrc.state = MLX5_UMR_STATE_RECOVER; 529 + 526 530 if (opcode == MLX5_CQE_REQ_ERR) { 527 531 wq = &(*cur_qp)->sq; 528 532 wqe_ctr = be16_to_cpu(cqe64->wqe_counter);
+146 -19
drivers/infiniband/hw/mlx5/fs.c
··· 679 679 #define MLX5_FS_MAX_TYPES 6 680 680 #define MLX5_FS_MAX_ENTRIES BIT(16) 681 681 682 - static struct mlx5_ib_flow_prio *_get_prio(struct mlx5_flow_namespace *ns, 682 + static bool mlx5_ib_shared_ft_allowed(struct ib_device *device) 683 + { 684 + struct mlx5_ib_dev *dev = to_mdev(device); 685 + 686 + return MLX5_CAP_GEN(dev->mdev, shared_object_to_user_object_allowed); 687 + } 688 + 689 + static struct mlx5_ib_flow_prio *_get_prio(struct mlx5_ib_dev *dev, 690 + struct mlx5_flow_namespace *ns, 683 691 struct mlx5_ib_flow_prio *prio, 684 692 int priority, 685 693 int num_entries, int num_groups, ··· 696 688 struct mlx5_flow_table_attr ft_attr = {}; 697 689 struct mlx5_flow_table *ft; 698 690 691 + if (mlx5_ib_shared_ft_allowed(&dev->ib_dev)) 692 + ft_attr.uid = MLX5_SHARED_RESOURCE_UID; 699 693 ft_attr.prio = priority; 700 694 ft_attr.max_fte = num_entries; 701 695 ft_attr.flags = flags; ··· 794 784 795 785 ft = prio->flow_table; 796 786 if (!ft) 797 - return _get_prio(ns, prio, priority, max_table_size, num_groups, 798 - flags); 787 + return _get_prio(dev, ns, prio, priority, max_table_size, 788 + num_groups, flags); 799 789 800 790 return prio; 801 791 } ··· 937 927 938 928 prio = &dev->flow_db->opfcs[type]; 939 929 if (!prio->flow_table) { 940 - prio = _get_prio(ns, prio, priority, 930 + prio = _get_prio(dev, ns, prio, priority, 941 931 dev->num_ports * MAX_OPFC_RULES, 1, 0); 942 932 if (IS_ERR(prio)) { 943 933 err = PTR_ERR(prio); ··· 1417 1407 } 1418 1408 1419 1409 static struct mlx5_ib_flow_prio * 1420 - _get_flow_table(struct mlx5_ib_dev *dev, 1421 - struct mlx5_ib_flow_matcher *fs_matcher, 1410 + _get_flow_table(struct mlx5_ib_dev *dev, u16 user_priority, 1411 + enum mlx5_flow_namespace_type ns_type, 1422 1412 bool mcast) 1423 1413 { 1424 1414 struct mlx5_flow_namespace *ns = NULL; ··· 1431 1421 if (mcast) 1432 1422 priority = MLX5_IB_FLOW_MCAST_PRIO; 1433 1423 else 1434 - priority = ib_prio_to_core_prio(fs_matcher->priority, false); 1424 + priority = ib_prio_to_core_prio(user_priority, false); 1435 1425 1436 1426 esw_encap = mlx5_eswitch_get_encap_mode(dev->mdev) != 1437 1427 DEVLINK_ESWITCH_ENCAP_MODE_NONE; 1438 - switch (fs_matcher->ns_type) { 1428 + switch (ns_type) { 1439 1429 case MLX5_FLOW_NAMESPACE_BYPASS: 1440 1430 max_table_size = BIT( 1441 1431 MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev, log_max_ft_size)); ··· 1462 1452 reformat_l3_tunnel_to_l2) && 1463 1453 esw_encap) 1464 1454 flags |= MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT; 1465 - priority = fs_matcher->priority; 1455 + priority = user_priority; 1466 1456 break; 1467 1457 case MLX5_FLOW_NAMESPACE_RDMA_RX: 1468 1458 max_table_size = BIT( 1469 1459 MLX5_CAP_FLOWTABLE_RDMA_RX(dev->mdev, log_max_ft_size)); 1470 - priority = fs_matcher->priority; 1460 + priority = user_priority; 1471 1461 break; 1472 1462 case MLX5_FLOW_NAMESPACE_RDMA_TX: 1473 1463 max_table_size = BIT( 1474 1464 MLX5_CAP_FLOWTABLE_RDMA_TX(dev->mdev, log_max_ft_size)); 1475 - priority = fs_matcher->priority; 1465 + priority = user_priority; 1476 1466 break; 1477 1467 default: 1478 1468 break; ··· 1480 1470 1481 1471 max_table_size = min_t(int, max_table_size, MLX5_FS_MAX_ENTRIES); 1482 1472 1483 - ns = mlx5_get_flow_namespace(dev->mdev, fs_matcher->ns_type); 1473 + ns = mlx5_get_flow_namespace(dev->mdev, ns_type); 1484 1474 if (!ns) 1485 1475 return ERR_PTR(-EOPNOTSUPP); 1486 1476 1487 - switch (fs_matcher->ns_type) { 1477 + switch (ns_type) { 1488 1478 case MLX5_FLOW_NAMESPACE_BYPASS: 1489 1479 prio = &dev->flow_db->prios[priority]; 1490 1480 break; ··· 1509 
1499 if (prio->flow_table) 1510 1500 return prio; 1511 1501 1512 - return _get_prio(ns, prio, priority, max_table_size, 1502 + return _get_prio(dev, ns, prio, priority, max_table_size, 1513 1503 MLX5_FS_MAX_TYPES, flags); 1514 1504 } 1515 1505 ··· 1628 1618 mcast = raw_fs_is_multicast(fs_matcher, cmd_in); 1629 1619 mutex_lock(&dev->flow_db->lock); 1630 1620 1631 - ft_prio = _get_flow_table(dev, fs_matcher, mcast); 1621 + ft_prio = _get_flow_table(dev, fs_matcher->priority, 1622 + fs_matcher->ns_type, mcast); 1632 1623 if (IS_ERR(ft_prio)) { 1633 1624 err = PTR_ERR(ft_prio); 1634 1625 goto unlock; ··· 2026 2015 return 0; 2027 2016 } 2028 2017 2018 + static int steering_anchor_cleanup(struct ib_uobject *uobject, 2019 + enum rdma_remove_reason why, 2020 + struct uverbs_attr_bundle *attrs) 2021 + { 2022 + struct mlx5_ib_steering_anchor *obj = uobject->object; 2023 + 2024 + if (atomic_read(&obj->usecnt)) 2025 + return -EBUSY; 2026 + 2027 + mutex_lock(&obj->dev->flow_db->lock); 2028 + put_flow_table(obj->dev, obj->ft_prio, true); 2029 + mutex_unlock(&obj->dev->flow_db->lock); 2030 + 2031 + kfree(obj); 2032 + return 0; 2033 + } 2034 + 2029 2035 static int mlx5_ib_matcher_ns(struct uverbs_attr_bundle *attrs, 2030 2036 struct mlx5_ib_flow_matcher *obj) 2031 2037 { ··· 2078 2050 if (err) 2079 2051 return err; 2080 2052 2081 - if (flags) { 2082 - mlx5_ib_ft_type_to_namespace( 2053 + if (flags) 2054 + return mlx5_ib_ft_type_to_namespace( 2083 2055 MLX5_IB_UAPI_FLOW_TABLE_TYPE_NIC_TX, 2084 2056 &obj->ns_type); 2085 - return 0; 2086 - } 2087 2057 } 2088 2058 2089 2059 obj->ns_type = MLX5_FLOW_NAMESPACE_BYPASS; ··· 2144 2118 2145 2119 end: 2146 2120 kfree(obj); 2121 + return err; 2122 + } 2123 + 2124 + static int UVERBS_HANDLER(MLX5_IB_METHOD_STEERING_ANCHOR_CREATE)( 2125 + struct uverbs_attr_bundle *attrs) 2126 + { 2127 + struct ib_uobject *uobj = uverbs_attr_get_uobject( 2128 + attrs, MLX5_IB_ATTR_STEERING_ANCHOR_CREATE_HANDLE); 2129 + struct mlx5_ib_dev *dev = mlx5_udata_to_mdev(&attrs->driver_udata); 2130 + enum mlx5_ib_uapi_flow_table_type ib_uapi_ft_type; 2131 + enum mlx5_flow_namespace_type ns_type; 2132 + struct mlx5_ib_steering_anchor *obj; 2133 + struct mlx5_ib_flow_prio *ft_prio; 2134 + u16 priority; 2135 + u32 ft_id; 2136 + int err; 2137 + 2138 + if (!capable(CAP_NET_RAW)) 2139 + return -EPERM; 2140 + 2141 + err = uverbs_get_const(&ib_uapi_ft_type, attrs, 2142 + MLX5_IB_ATTR_STEERING_ANCHOR_FT_TYPE); 2143 + if (err) 2144 + return err; 2145 + 2146 + err = mlx5_ib_ft_type_to_namespace(ib_uapi_ft_type, &ns_type); 2147 + if (err) 2148 + return err; 2149 + 2150 + err = uverbs_copy_from(&priority, attrs, 2151 + MLX5_IB_ATTR_STEERING_ANCHOR_PRIORITY); 2152 + if (err) 2153 + return err; 2154 + 2155 + obj = kzalloc(sizeof(*obj), GFP_KERNEL); 2156 + if (!obj) 2157 + return -ENOMEM; 2158 + 2159 + mutex_lock(&dev->flow_db->lock); 2160 + ft_prio = _get_flow_table(dev, priority, ns_type, 0); 2161 + if (IS_ERR(ft_prio)) { 2162 + mutex_unlock(&dev->flow_db->lock); 2163 + err = PTR_ERR(ft_prio); 2164 + goto free_obj; 2165 + } 2166 + 2167 + ft_prio->refcount++; 2168 + ft_id = mlx5_flow_table_id(ft_prio->flow_table); 2169 + mutex_unlock(&dev->flow_db->lock); 2170 + 2171 + err = uverbs_copy_to(attrs, MLX5_IB_ATTR_STEERING_ANCHOR_FT_ID, 2172 + &ft_id, sizeof(ft_id)); 2173 + if (err) 2174 + goto put_flow_table; 2175 + 2176 + uobj->object = obj; 2177 + obj->dev = dev; 2178 + obj->ft_prio = ft_prio; 2179 + atomic_set(&obj->usecnt, 0); 2180 + 2181 + return 0; 2182 + 2183 + put_flow_table: 2184 + 
mutex_lock(&dev->flow_db->lock); 2185 + put_flow_table(dev, ft_prio, true); 2186 + mutex_unlock(&dev->flow_db->lock); 2187 + free_obj: 2188 + kfree(obj); 2189 + 2147 2190 return err; 2148 2191 } 2149 2192 ··· 2572 2477 &UVERBS_METHOD(MLX5_IB_METHOD_FLOW_MATCHER_CREATE), 2573 2478 &UVERBS_METHOD(MLX5_IB_METHOD_FLOW_MATCHER_DESTROY)); 2574 2479 2480 + DECLARE_UVERBS_NAMED_METHOD( 2481 + MLX5_IB_METHOD_STEERING_ANCHOR_CREATE, 2482 + UVERBS_ATTR_IDR(MLX5_IB_ATTR_STEERING_ANCHOR_CREATE_HANDLE, 2483 + MLX5_IB_OBJECT_STEERING_ANCHOR, 2484 + UVERBS_ACCESS_NEW, 2485 + UA_MANDATORY), 2486 + UVERBS_ATTR_CONST_IN(MLX5_IB_ATTR_STEERING_ANCHOR_FT_TYPE, 2487 + enum mlx5_ib_uapi_flow_table_type, 2488 + UA_MANDATORY), 2489 + UVERBS_ATTR_PTR_IN(MLX5_IB_ATTR_STEERING_ANCHOR_PRIORITY, 2490 + UVERBS_ATTR_TYPE(u16), 2491 + UA_MANDATORY), 2492 + UVERBS_ATTR_PTR_IN(MLX5_IB_ATTR_STEERING_ANCHOR_FT_ID, 2493 + UVERBS_ATTR_TYPE(u32), 2494 + UA_MANDATORY)); 2495 + 2496 + DECLARE_UVERBS_NAMED_METHOD_DESTROY( 2497 + MLX5_IB_METHOD_STEERING_ANCHOR_DESTROY, 2498 + UVERBS_ATTR_IDR(MLX5_IB_ATTR_STEERING_ANCHOR_DESTROY_HANDLE, 2499 + MLX5_IB_OBJECT_STEERING_ANCHOR, 2500 + UVERBS_ACCESS_DESTROY, 2501 + UA_MANDATORY)); 2502 + 2503 + DECLARE_UVERBS_NAMED_OBJECT( 2504 + MLX5_IB_OBJECT_STEERING_ANCHOR, 2505 + UVERBS_TYPE_ALLOC_IDR(steering_anchor_cleanup), 2506 + &UVERBS_METHOD(MLX5_IB_METHOD_STEERING_ANCHOR_CREATE), 2507 + &UVERBS_METHOD(MLX5_IB_METHOD_STEERING_ANCHOR_DESTROY)); 2508 + 2575 2509 const struct uapi_definition mlx5_ib_flow_defs[] = { 2576 2510 UAPI_DEF_CHAIN_OBJ_TREE_NAMED( 2577 2511 MLX5_IB_OBJECT_FLOW_MATCHER), ··· 2609 2485 &mlx5_ib_fs), 2610 2486 UAPI_DEF_CHAIN_OBJ_TREE(UVERBS_OBJECT_FLOW_ACTION, 2611 2487 &mlx5_ib_flow_actions), 2488 + UAPI_DEF_CHAIN_OBJ_TREE_NAMED( 2489 + MLX5_IB_OBJECT_STEERING_ANCHOR, 2490 + UAPI_DEF_IS_OBJ_SUPPORTED(mlx5_ib_shared_ft_allowed)), 2612 2491 {}, 2613 2492 }; 2614 2493
+2 -2
drivers/infiniband/hw/mlx5/main.c
··· 4002 4002 { 4003 4003 int err; 4004 4004 4005 - err = mlx5_mr_cache_cleanup(dev); 4005 + err = mlx5_mkey_cache_cleanup(dev); 4006 4006 if (err) 4007 4007 mlx5_ib_warn(dev, "mr cache cleanup failed\n"); 4008 4008 ··· 4022 4022 if (ret) 4023 4023 return ret; 4024 4024 4025 - ret = mlx5_mr_cache_init(dev); 4025 + ret = mlx5_mkey_cache_init(dev); 4026 4026 if (ret) { 4027 4027 mlx5_ib_warn(dev, "mr cache init failed %d\n", ret); 4028 4028 mlx5r_umr_resource_cleanup(dev);
+40 -39
drivers/infiniband/hw/mlx5/mlx5_ib.h
··· 259 259 u8 match_criteria_enable; 260 260 }; 261 261 262 + struct mlx5_ib_steering_anchor { 263 + struct mlx5_ib_flow_prio *ft_prio; 264 + struct mlx5_ib_dev *dev; 265 + atomic_t usecnt; 266 + }; 267 + 262 268 struct mlx5_ib_pp { 263 269 u16 index; 264 270 struct mlx5_core_dev *mdev; ··· 619 613 unsigned int ndescs; 620 614 struct wait_queue_head wait; 621 615 refcount_t usecount; 616 + struct mlx5_cache_ent *cache_ent; 622 617 }; 623 618 624 619 #define MLX5_IB_MTT_PRESENT (MLX5_IB_MTT_READ | MLX5_IB_MTT_WRITE) ··· 642 635 struct ib_mr ibmr; 643 636 struct mlx5_ib_mkey mmkey; 644 637 645 - /* User MR data */ 646 - struct mlx5_cache_ent *cache_ent; 647 - /* Everything after cache_ent is zero'd when MR allocated */ 648 638 struct ib_umem *umem; 649 639 650 640 union { 651 - /* Used only while the MR is in the cache */ 652 - struct { 653 - u32 out[MLX5_ST_SZ_DW(create_mkey_out)]; 654 - struct mlx5_async_work cb_work; 655 - /* Cache list element */ 656 - struct list_head list; 657 - }; 658 - 659 641 /* Used only by kernel MRs (umem == NULL) */ 660 642 struct { 661 643 void *descs; ··· 684 688 }; 685 689 }; 686 690 687 - /* Zero the fields in the mr that are variant depending on usage */ 688 - static inline void mlx5_clear_mr(struct mlx5_ib_mr *mr) 689 - { 690 - memset_after(mr, 0, cache_ent); 691 - } 692 - 693 691 static inline bool is_odp_mr(struct mlx5_ib_mr *mr) 694 692 { 695 693 return IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING) && mr->umem && ··· 707 717 struct completion done; 708 718 }; 709 719 720 + enum { 721 + MLX5_UMR_STATE_ACTIVE, 722 + MLX5_UMR_STATE_RECOVER, 723 + MLX5_UMR_STATE_ERR, 724 + }; 725 + 710 726 struct umr_common { 711 727 struct ib_pd *pd; 712 728 struct ib_cq *cq; 713 729 struct ib_qp *qp; 714 - /* control access to UMR QP 730 + /* Protects from UMR QP overflow 715 731 */ 716 732 struct semaphore sem; 733 + /* Protects from using UMR while the UMR is not active 734 + */ 735 + struct mutex lock; 736 + unsigned int state; 717 737 }; 718 738 719 739 struct mlx5_cache_ent { 720 - struct list_head head; 721 - /* sync access to the cahce entry 722 - */ 723 - spinlock_t lock; 724 - 740 + struct xarray mkeys; 741 + unsigned long stored; 742 + unsigned long reserved; 725 743 726 744 char name[4]; 727 745 u32 order; ··· 741 743 u8 fill_to_high_water:1; 742 744 743 745 /* 744 - * - available_mrs is the length of list head, ie the number of MRs 745 - * available for immediate allocation. 746 - * - total_mrs is available_mrs plus all in use MRs that could be 747 - * returned to the cache. 748 - * - limit is the low water mark for available_mrs, 2* limit is the 746 + * - limit is the low water mark for stored mkeys, 2* limit is the 749 747 * upper water mark. 
750 - * - pending is the number of MRs currently being created 751 748 */ 752 - u32 total_mrs; 753 - u32 available_mrs; 749 + u32 in_use; 754 750 u32 limit; 755 - u32 pending; 756 751 757 752 /* Statistics */ 758 753 u32 miss; ··· 754 763 struct delayed_work dwork; 755 764 }; 756 765 757 - struct mlx5_mr_cache { 766 + struct mlx5r_async_create_mkey { 767 + union { 768 + u32 in[MLX5_ST_SZ_BYTES(create_mkey_in)]; 769 + u32 out[MLX5_ST_SZ_DW(create_mkey_out)]; 770 + }; 771 + struct mlx5_async_work cb_work; 772 + struct mlx5_cache_ent *ent; 773 + u32 mkey; 774 + }; 775 + 776 + struct mlx5_mkey_cache { 758 777 struct workqueue_struct *wq; 759 - struct mlx5_cache_ent ent[MAX_MR_CACHE_ENTRIES]; 778 + struct mlx5_cache_ent ent[MAX_MKEY_CACHE_ENTRIES]; 760 779 struct dentry *root; 761 780 unsigned long last_add; 762 781 }; ··· 1065 1064 struct mlx5_ib_resources devr; 1066 1065 1067 1066 atomic_t mkey_var; 1068 - struct mlx5_mr_cache cache; 1067 + struct mlx5_mkey_cache cache; 1069 1068 struct timer_list delay_timer; 1070 1069 /* Prevents soft lock on massive reg MRs */ 1071 1070 struct mutex slow_path_mutex; ··· 1310 1309 u64 access_flags); 1311 1310 void mlx5_ib_copy_pas(u64 *old, u64 *new, int step, int num); 1312 1311 int mlx5_ib_get_cqe_size(struct ib_cq *ibcq); 1313 - int mlx5_mr_cache_init(struct mlx5_ib_dev *dev); 1314 - int mlx5_mr_cache_cleanup(struct mlx5_ib_dev *dev); 1312 + int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev); 1313 + int mlx5_mkey_cache_cleanup(struct mlx5_ib_dev *dev); 1315 1314 1316 1315 struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev, 1317 1316 struct mlx5_cache_ent *ent, ··· 1339 1338 void mlx5_ib_odp_cleanup_one(struct mlx5_ib_dev *ibdev); 1340 1339 int __init mlx5_ib_odp_init(void); 1341 1340 void mlx5_ib_odp_cleanup(void); 1342 - void mlx5_odp_init_mr_cache_entry(struct mlx5_cache_ent *ent); 1341 + void mlx5_odp_init_mkey_cache_entry(struct mlx5_cache_ent *ent); 1343 1342 void mlx5_odp_populate_xlt(void *xlt, size_t idx, size_t nentries, 1344 1343 struct mlx5_ib_mr *mr, int flags); 1345 1344 ··· 1358 1357 static inline void mlx5_ib_odp_cleanup_one(struct mlx5_ib_dev *ibdev) {} 1359 1358 static inline int mlx5_ib_odp_init(void) { return 0; } 1360 1359 static inline void mlx5_ib_odp_cleanup(void) {} 1361 - static inline void mlx5_odp_init_mr_cache_entry(struct mlx5_cache_ent *ent) {} 1360 + static inline void mlx5_odp_init_mkey_cache_entry(struct mlx5_cache_ent *ent) {} 1362 1361 static inline void mlx5_odp_populate_xlt(void *xlt, size_t idx, size_t nentries, 1363 1362 struct mlx5_ib_mr *mr, int flags) {} 1364 1363
+278 -236
drivers/infiniband/hw/mlx5/mr.c
··· 82 82 MLX5_SET64(mkc, mkc, start_addr, start_addr); 83 83 } 84 84 85 - static void assign_mkey_variant(struct mlx5_ib_dev *dev, 86 - struct mlx5_ib_mkey *mkey, u32 *in) 85 + static void assign_mkey_variant(struct mlx5_ib_dev *dev, u32 *mkey, u32 *in) 87 86 { 88 87 u8 key = atomic_inc_return(&dev->mkey_var); 89 88 void *mkc; 90 89 91 90 mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); 92 91 MLX5_SET(mkc, mkc, mkey_7_0, key); 93 - mkey->key = key; 92 + *mkey = key; 94 93 } 95 94 96 95 static int mlx5_ib_create_mkey(struct mlx5_ib_dev *dev, ··· 97 98 { 98 99 int ret; 99 100 100 - assign_mkey_variant(dev, mkey, in); 101 + assign_mkey_variant(dev, &mkey->key, in); 101 102 ret = mlx5_core_create_mkey(dev->mdev, &mkey->key, in, inlen); 102 103 if (!ret) 103 104 init_waitqueue_head(&mkey->wait); ··· 105 106 return ret; 106 107 } 107 108 108 - static int 109 - mlx5_ib_create_mkey_cb(struct mlx5_ib_dev *dev, 110 - struct mlx5_ib_mkey *mkey, 111 - struct mlx5_async_ctx *async_ctx, 112 - u32 *in, int inlen, u32 *out, int outlen, 113 - struct mlx5_async_work *context) 109 + static int mlx5_ib_create_mkey_cb(struct mlx5r_async_create_mkey *async_create) 114 110 { 115 - MLX5_SET(create_mkey_in, in, opcode, MLX5_CMD_OP_CREATE_MKEY); 116 - assign_mkey_variant(dev, mkey, in); 117 - return mlx5_cmd_exec_cb(async_ctx, in, inlen, out, outlen, 118 - create_mkey_callback, context); 111 + struct mlx5_ib_dev *dev = async_create->ent->dev; 112 + size_t inlen = MLX5_ST_SZ_BYTES(create_mkey_in); 113 + size_t outlen = MLX5_ST_SZ_BYTES(create_mkey_out); 114 + 115 + MLX5_SET(create_mkey_in, async_create->in, opcode, 116 + MLX5_CMD_OP_CREATE_MKEY); 117 + assign_mkey_variant(dev, &async_create->mkey, async_create->in); 118 + return mlx5_cmd_exec_cb(&dev->async_ctx, async_create->in, inlen, 119 + async_create->out, outlen, create_mkey_callback, 120 + &async_create->cb_work); 119 121 } 120 122 121 - static int mr_cache_max_order(struct mlx5_ib_dev *dev); 123 + static int mkey_cache_max_order(struct mlx5_ib_dev *dev); 122 124 static void queue_adjust_cache_locked(struct mlx5_cache_ent *ent); 123 125 124 126 static int destroy_mkey(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) ··· 142 142 mlx5_cmd_out_err(dev->mdev, MLX5_CMD_OP_CREATE_MKEY, 0, out); 143 143 } 144 144 145 + 146 + static int push_mkey(struct mlx5_cache_ent *ent, bool limit_pendings, 147 + void *to_store) 148 + { 149 + XA_STATE(xas, &ent->mkeys, 0); 150 + void *curr; 151 + 152 + xa_lock_irq(&ent->mkeys); 153 + if (limit_pendings && 154 + (ent->reserved - ent->stored) > MAX_PENDING_REG_MR) { 155 + xa_unlock_irq(&ent->mkeys); 156 + return -EAGAIN; 157 + } 158 + while (1) { 159 + /* 160 + * This is cmpxchg (NULL, XA_ZERO_ENTRY) however this version 161 + * doesn't transparently unlock. Instead we set the xas index to 162 + * the current value of reserved every iteration. 
163 + */ 164 + xas_set(&xas, ent->reserved); 165 + curr = xas_load(&xas); 166 + if (!curr) { 167 + if (to_store && ent->stored == ent->reserved) 168 + xas_store(&xas, to_store); 169 + else 170 + xas_store(&xas, XA_ZERO_ENTRY); 171 + if (xas_valid(&xas)) { 172 + ent->reserved++; 173 + if (to_store) { 174 + if (ent->stored != ent->reserved) 175 + __xa_store(&ent->mkeys, 176 + ent->stored, 177 + to_store, 178 + GFP_KERNEL); 179 + ent->stored++; 180 + queue_adjust_cache_locked(ent); 181 + WRITE_ONCE(ent->dev->cache.last_add, 182 + jiffies); 183 + } 184 + } 185 + } 186 + xa_unlock_irq(&ent->mkeys); 187 + 188 + /* 189 + * Notice xas_nomem() must always be called as it cleans 190 + * up any cached allocation. 191 + */ 192 + if (!xas_nomem(&xas, GFP_KERNEL)) 193 + break; 194 + xa_lock_irq(&ent->mkeys); 195 + } 196 + if (xas_error(&xas)) 197 + return xas_error(&xas); 198 + if (WARN_ON(curr)) 199 + return -EINVAL; 200 + return 0; 201 + } 202 + 203 + static void undo_push_reserve_mkey(struct mlx5_cache_ent *ent) 204 + { 205 + void *old; 206 + 207 + ent->reserved--; 208 + old = __xa_erase(&ent->mkeys, ent->reserved); 209 + WARN_ON(old); 210 + } 211 + 212 + static void push_to_reserved(struct mlx5_cache_ent *ent, u32 mkey) 213 + { 214 + void *old; 215 + 216 + old = __xa_store(&ent->mkeys, ent->stored, xa_mk_value(mkey), 0); 217 + WARN_ON(old); 218 + ent->stored++; 219 + } 220 + 221 + static u32 pop_stored_mkey(struct mlx5_cache_ent *ent) 222 + { 223 + void *old, *xa_mkey; 224 + 225 + ent->stored--; 226 + ent->reserved--; 227 + 228 + if (ent->stored == ent->reserved) { 229 + xa_mkey = __xa_erase(&ent->mkeys, ent->stored); 230 + WARN_ON(!xa_mkey); 231 + return (u32)xa_to_value(xa_mkey); 232 + } 233 + 234 + xa_mkey = __xa_store(&ent->mkeys, ent->stored, XA_ZERO_ENTRY, 235 + GFP_KERNEL); 236 + WARN_ON(!xa_mkey || xa_is_err(xa_mkey)); 237 + old = __xa_erase(&ent->mkeys, ent->reserved); 238 + WARN_ON(old); 239 + return (u32)xa_to_value(xa_mkey); 240 + } 241 + 145 242 static void create_mkey_callback(int status, struct mlx5_async_work *context) 146 243 { 147 - struct mlx5_ib_mr *mr = 148 - container_of(context, struct mlx5_ib_mr, cb_work); 149 - struct mlx5_cache_ent *ent = mr->cache_ent; 244 + struct mlx5r_async_create_mkey *mkey_out = 245 + container_of(context, struct mlx5r_async_create_mkey, cb_work); 246 + struct mlx5_cache_ent *ent = mkey_out->ent; 150 247 struct mlx5_ib_dev *dev = ent->dev; 151 248 unsigned long flags; 152 249 153 250 if (status) { 154 - create_mkey_warn(dev, status, mr->out); 155 - kfree(mr); 156 - spin_lock_irqsave(&ent->lock, flags); 157 - ent->pending--; 251 + create_mkey_warn(dev, status, mkey_out->out); 252 + kfree(mkey_out); 253 + xa_lock_irqsave(&ent->mkeys, flags); 254 + undo_push_reserve_mkey(ent); 158 255 WRITE_ONCE(dev->fill_delay, 1); 159 - spin_unlock_irqrestore(&ent->lock, flags); 256 + xa_unlock_irqrestore(&ent->mkeys, flags); 160 257 mod_timer(&dev->delay_timer, jiffies + HZ); 161 258 return; 162 259 } 163 260 164 - mr->mmkey.type = MLX5_MKEY_MR; 165 - mr->mmkey.key |= mlx5_idx_to_mkey( 166 - MLX5_GET(create_mkey_out, mr->out, mkey_index)); 167 - init_waitqueue_head(&mr->mmkey.wait); 168 - 261 + mkey_out->mkey |= mlx5_idx_to_mkey( 262 + MLX5_GET(create_mkey_out, mkey_out->out, mkey_index)); 169 263 WRITE_ONCE(dev->cache.last_add, jiffies); 170 264 171 - spin_lock_irqsave(&ent->lock, flags); 172 - list_add_tail(&mr->list, &ent->head); 173 - ent->available_mrs++; 174 - ent->total_mrs++; 265 + xa_lock_irqsave(&ent->mkeys, flags); 266 + push_to_reserved(ent, 
mkey_out->mkey); 175 267 /* If we are doing fill_to_high_water then keep going. */ 176 268 queue_adjust_cache_locked(ent); 177 - ent->pending--; 178 - spin_unlock_irqrestore(&ent->lock, flags); 269 + xa_unlock_irqrestore(&ent->mkeys, flags); 270 + kfree(mkey_out); 179 271 } 180 272 181 273 static int get_mkc_octo_size(unsigned int access_mode, unsigned int ndescs) ··· 289 197 return ret; 290 198 } 291 199 292 - static struct mlx5_ib_mr *alloc_cache_mr(struct mlx5_cache_ent *ent, void *mkc) 200 + static void set_cache_mkc(struct mlx5_cache_ent *ent, void *mkc) 293 201 { 294 - struct mlx5_ib_mr *mr; 295 - 296 - mr = kzalloc(sizeof(*mr), GFP_KERNEL); 297 - if (!mr) 298 - return NULL; 299 - mr->cache_ent = ent; 300 - 301 202 set_mkc_access_pd_addr_fields(mkc, 0, 0, ent->dev->umrc.pd); 302 203 MLX5_SET(mkc, mkc, free, 1); 303 204 MLX5_SET(mkc, mkc, umr_en, 1); ··· 300 215 MLX5_SET(mkc, mkc, translations_octword_size, 301 216 get_mkc_octo_size(ent->access_mode, ent->ndescs)); 302 217 MLX5_SET(mkc, mkc, log_page_size, ent->page); 303 - return mr; 304 218 } 305 219 306 220 /* Asynchronously schedule new MRs to be populated in the cache. */ 307 221 static int add_keys(struct mlx5_cache_ent *ent, unsigned int num) 308 222 { 309 - size_t inlen = MLX5_ST_SZ_BYTES(create_mkey_in); 310 - struct mlx5_ib_mr *mr; 223 + struct mlx5r_async_create_mkey *async_create; 311 224 void *mkc; 312 - u32 *in; 313 225 int err = 0; 314 226 int i; 315 227 316 - in = kzalloc(inlen, GFP_KERNEL); 317 - if (!in) 318 - return -ENOMEM; 319 - 320 - mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); 321 228 for (i = 0; i < num; i++) { 322 - mr = alloc_cache_mr(ent, mkc); 323 - if (!mr) { 324 - err = -ENOMEM; 325 - break; 326 - } 327 - spin_lock_irq(&ent->lock); 328 - if (ent->pending >= MAX_PENDING_REG_MR) { 329 - err = -EAGAIN; 330 - spin_unlock_irq(&ent->lock); 331 - kfree(mr); 332 - break; 333 - } 334 - ent->pending++; 335 - spin_unlock_irq(&ent->lock); 336 - err = mlx5_ib_create_mkey_cb(ent->dev, &mr->mmkey, 337 - &ent->dev->async_ctx, in, inlen, 338 - mr->out, sizeof(mr->out), 339 - &mr->cb_work); 229 + async_create = kzalloc(sizeof(struct mlx5r_async_create_mkey), 230 + GFP_KERNEL); 231 + if (!async_create) 232 + return -ENOMEM; 233 + mkc = MLX5_ADDR_OF(create_mkey_in, async_create->in, 234 + memory_key_mkey_entry); 235 + set_cache_mkc(ent, mkc); 236 + async_create->ent = ent; 237 + 238 + err = push_mkey(ent, true, NULL); 239 + if (err) 240 + goto free_async_create; 241 + 242 + err = mlx5_ib_create_mkey_cb(async_create); 340 243 if (err) { 341 - spin_lock_irq(&ent->lock); 342 - ent->pending--; 343 - spin_unlock_irq(&ent->lock); 344 244 mlx5_ib_warn(ent->dev, "create mkey failed %d\n", err); 345 - kfree(mr); 346 - break; 245 + goto err_undo_reserve; 347 246 } 348 247 } 349 248 350 - kfree(in); 249 + return 0; 250 + 251 + err_undo_reserve: 252 + xa_lock_irq(&ent->mkeys); 253 + undo_push_reserve_mkey(ent); 254 + xa_unlock_irq(&ent->mkeys); 255 + free_async_create: 256 + kfree(async_create); 351 257 return err; 352 258 } 353 259 354 260 /* Synchronously create a MR in the cache */ 355 - static struct mlx5_ib_mr *create_cache_mr(struct mlx5_cache_ent *ent) 261 + static int create_cache_mkey(struct mlx5_cache_ent *ent, u32 *mkey) 356 262 { 357 263 size_t inlen = MLX5_ST_SZ_BYTES(create_mkey_in); 358 - struct mlx5_ib_mr *mr; 359 264 void *mkc; 360 265 u32 *in; 361 266 int err; 362 267 363 268 in = kzalloc(inlen, GFP_KERNEL); 364 269 if (!in) 365 - return ERR_PTR(-ENOMEM); 270 + return -ENOMEM; 366 271 mkc = 
MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); 272 + set_cache_mkc(ent, mkc); 367 273 368 - mr = alloc_cache_mr(ent, mkc); 369 - if (!mr) { 370 - err = -ENOMEM; 371 - goto free_in; 372 - } 373 - 374 - err = mlx5_core_create_mkey(ent->dev->mdev, &mr->mmkey.key, in, inlen); 274 + err = mlx5_core_create_mkey(ent->dev->mdev, mkey, in, inlen); 375 275 if (err) 376 - goto free_mr; 276 + goto free_in; 377 277 378 - init_waitqueue_head(&mr->mmkey.wait); 379 - mr->mmkey.type = MLX5_MKEY_MR; 380 278 WRITE_ONCE(ent->dev->cache.last_add, jiffies); 381 - spin_lock_irq(&ent->lock); 382 - ent->total_mrs++; 383 - spin_unlock_irq(&ent->lock); 384 - kfree(in); 385 - return mr; 386 - free_mr: 387 - kfree(mr); 388 279 free_in: 389 280 kfree(in); 390 - return ERR_PTR(err); 281 + return err; 391 282 } 392 283 393 284 static void remove_cache_mr_locked(struct mlx5_cache_ent *ent) 394 285 { 395 - struct mlx5_ib_mr *mr; 286 + u32 mkey; 396 287 397 - lockdep_assert_held(&ent->lock); 398 - if (list_empty(&ent->head)) 288 + lockdep_assert_held(&ent->mkeys.xa_lock); 289 + if (!ent->stored) 399 290 return; 400 - mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list); 401 - list_del(&mr->list); 402 - ent->available_mrs--; 403 - ent->total_mrs--; 404 - spin_unlock_irq(&ent->lock); 405 - mlx5_core_destroy_mkey(ent->dev->mdev, mr->mmkey.key); 406 - kfree(mr); 407 - spin_lock_irq(&ent->lock); 291 + mkey = pop_stored_mkey(ent); 292 + xa_unlock_irq(&ent->mkeys); 293 + mlx5_core_destroy_mkey(ent->dev->mdev, mkey); 294 + xa_lock_irq(&ent->mkeys); 408 295 } 409 296 410 297 static int resize_available_mrs(struct mlx5_cache_ent *ent, unsigned int target, 411 298 bool limit_fill) 299 + __acquires(&ent->mkeys) __releases(&ent->mkeys) 412 300 { 413 301 int err; 414 302 415 - lockdep_assert_held(&ent->lock); 303 + lockdep_assert_held(&ent->mkeys.xa_lock); 416 304 417 305 while (true) { 418 306 if (limit_fill) 419 307 target = ent->limit * 2; 420 - if (target == ent->available_mrs + ent->pending) 308 + if (target == ent->reserved) 421 309 return 0; 422 - if (target > ent->available_mrs + ent->pending) { 423 - u32 todo = target - (ent->available_mrs + ent->pending); 310 + if (target > ent->reserved) { 311 + u32 todo = target - ent->reserved; 424 312 425 - spin_unlock_irq(&ent->lock); 313 + xa_unlock_irq(&ent->mkeys); 426 314 err = add_keys(ent, todo); 427 315 if (err == -EAGAIN) 428 316 usleep_range(3000, 5000); 429 - spin_lock_irq(&ent->lock); 317 + xa_lock_irq(&ent->mkeys); 430 318 if (err) { 431 319 if (err != -EAGAIN) 432 320 return err; ··· 424 366 425 367 /* 426 368 * Target is the new value of total_mrs the user requests, however we 427 - * cannot free MRs that are in use. Compute the target value for 428 - * available_mrs. 369 + * cannot free MRs that are in use. Compute the target value for stored 370 + * mkeys. 
429 371 */ 430 - spin_lock_irq(&ent->lock); 431 - if (target < ent->total_mrs - ent->available_mrs) { 372 + xa_lock_irq(&ent->mkeys); 373 + if (target < ent->in_use) { 432 374 err = -EINVAL; 433 375 goto err_unlock; 434 376 } 435 - target = target - (ent->total_mrs - ent->available_mrs); 377 + target = target - ent->in_use; 436 378 if (target < ent->limit || target > ent->limit*2) { 437 379 err = -EINVAL; 438 380 goto err_unlock; ··· 440 382 err = resize_available_mrs(ent, target, false); 441 383 if (err) 442 384 goto err_unlock; 443 - spin_unlock_irq(&ent->lock); 385 + xa_unlock_irq(&ent->mkeys); 444 386 445 387 return count; 446 388 447 389 err_unlock: 448 - spin_unlock_irq(&ent->lock); 390 + xa_unlock_irq(&ent->mkeys); 449 391 return err; 450 392 } 451 393 ··· 456 398 char lbuf[20]; 457 399 int err; 458 400 459 - err = snprintf(lbuf, sizeof(lbuf), "%d\n", ent->total_mrs); 401 + err = snprintf(lbuf, sizeof(lbuf), "%ld\n", ent->stored + ent->in_use); 460 402 if (err < 0) 461 403 return err; 462 404 ··· 485 427 * Upon set we immediately fill the cache to high water mark implied by 486 428 * the limit. 487 429 */ 488 - spin_lock_irq(&ent->lock); 430 + xa_lock_irq(&ent->mkeys); 489 431 ent->limit = var; 490 432 err = resize_available_mrs(ent, 0, true); 491 - spin_unlock_irq(&ent->lock); 433 + xa_unlock_irq(&ent->mkeys); 492 434 if (err) 493 435 return err; 494 436 return count; ··· 515 457 .read = limit_read, 516 458 }; 517 459 518 - static bool someone_adding(struct mlx5_mr_cache *cache) 460 + static bool someone_adding(struct mlx5_mkey_cache *cache) 519 461 { 520 462 unsigned int i; 521 463 522 - for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) { 464 + for (i = 0; i < MAX_MKEY_CACHE_ENTRIES; i++) { 523 465 struct mlx5_cache_ent *ent = &cache->ent[i]; 524 466 bool ret; 525 467 526 - spin_lock_irq(&ent->lock); 527 - ret = ent->available_mrs < ent->limit; 528 - spin_unlock_irq(&ent->lock); 468 + xa_lock_irq(&ent->mkeys); 469 + ret = ent->stored < ent->limit; 470 + xa_unlock_irq(&ent->mkeys); 529 471 if (ret) 530 472 return true; 531 473 } ··· 539 481 */ 540 482 static void queue_adjust_cache_locked(struct mlx5_cache_ent *ent) 541 483 { 542 - lockdep_assert_held(&ent->lock); 484 + lockdep_assert_held(&ent->mkeys.xa_lock); 543 485 544 486 if (ent->disabled || READ_ONCE(ent->dev->fill_delay)) 545 487 return; 546 - if (ent->available_mrs < ent->limit) { 488 + if (ent->stored < ent->limit) { 547 489 ent->fill_to_high_water = true; 548 490 mod_delayed_work(ent->dev->cache.wq, &ent->dwork, 0); 549 491 } else if (ent->fill_to_high_water && 550 - ent->available_mrs + ent->pending < 2 * ent->limit) { 492 + ent->reserved < 2 * ent->limit) { 551 493 /* 552 494 * Once we start populating due to hitting a low water mark 553 495 * continue until we pass the high water mark. 
554 496 */ 555 497 mod_delayed_work(ent->dev->cache.wq, &ent->dwork, 0); 556 - } else if (ent->available_mrs == 2 * ent->limit) { 498 + } else if (ent->stored == 2 * ent->limit) { 557 499 ent->fill_to_high_water = false; 558 - } else if (ent->available_mrs > 2 * ent->limit) { 500 + } else if (ent->stored > 2 * ent->limit) { 559 501 /* Queue deletion of excess entries */ 560 502 ent->fill_to_high_water = false; 561 - if (ent->pending) 503 + if (ent->stored != ent->reserved) 562 504 queue_delayed_work(ent->dev->cache.wq, &ent->dwork, 563 505 msecs_to_jiffies(1000)); 564 506 else ··· 569 511 static void __cache_work_func(struct mlx5_cache_ent *ent) 570 512 { 571 513 struct mlx5_ib_dev *dev = ent->dev; 572 - struct mlx5_mr_cache *cache = &dev->cache; 514 + struct mlx5_mkey_cache *cache = &dev->cache; 573 515 int err; 574 516 575 - spin_lock_irq(&ent->lock); 517 + xa_lock_irq(&ent->mkeys); 576 518 if (ent->disabled) 577 519 goto out; 578 520 579 - if (ent->fill_to_high_water && 580 - ent->available_mrs + ent->pending < 2 * ent->limit && 521 + if (ent->fill_to_high_water && ent->reserved < 2 * ent->limit && 581 522 !READ_ONCE(dev->fill_delay)) { 582 - spin_unlock_irq(&ent->lock); 523 + xa_unlock_irq(&ent->mkeys); 583 524 err = add_keys(ent, 1); 584 - spin_lock_irq(&ent->lock); 525 + xa_lock_irq(&ent->mkeys); 585 526 if (ent->disabled) 586 527 goto out; 587 528 if (err) { 588 529 /* 589 - * EAGAIN only happens if pending is positive, so we 590 - * will be rescheduled from reg_mr_callback(). The only 530 + * EAGAIN only happens if there are pending MRs, so we 531 + * will be rescheduled when storing them. The only 591 532 * failure path here is ENOMEM. 592 533 */ 593 534 if (err != -EAGAIN) { ··· 598 541 msecs_to_jiffies(1000)); 599 542 } 600 543 } 601 - } else if (ent->available_mrs > 2 * ent->limit) { 544 + } else if (ent->stored > 2 * ent->limit) { 602 545 bool need_delay; 603 546 604 547 /* ··· 613 556 * the garbage collection work to try to run in next cycle, in 614 557 * order to free CPU resources to other tasks. 
615 558 */ 616 - spin_unlock_irq(&ent->lock); 559 + xa_unlock_irq(&ent->mkeys); 617 560 need_delay = need_resched() || someone_adding(cache) || 618 561 !time_after(jiffies, 619 562 READ_ONCE(cache->last_add) + 300 * HZ); 620 - spin_lock_irq(&ent->lock); 563 + xa_lock_irq(&ent->mkeys); 621 564 if (ent->disabled) 622 565 goto out; 623 566 if (need_delay) { ··· 628 571 queue_adjust_cache_locked(ent); 629 572 } 630 573 out: 631 - spin_unlock_irq(&ent->lock); 574 + xa_unlock_irq(&ent->mkeys); 632 575 } 633 576 634 577 static void delayed_cache_work_func(struct work_struct *work) ··· 644 587 int access_flags) 645 588 { 646 589 struct mlx5_ib_mr *mr; 590 + int err; 647 591 648 - /* Matches access in alloc_cache_mr() */ 649 592 if (!mlx5r_umr_can_reconfig(dev, 0, access_flags)) 650 593 return ERR_PTR(-EOPNOTSUPP); 651 594 652 - spin_lock_irq(&ent->lock); 653 - if (list_empty(&ent->head)) { 595 + mr = kzalloc(sizeof(*mr), GFP_KERNEL); 596 + if (!mr) 597 + return ERR_PTR(-ENOMEM); 598 + 599 + xa_lock_irq(&ent->mkeys); 600 + ent->in_use++; 601 + 602 + if (!ent->stored) { 654 603 queue_adjust_cache_locked(ent); 655 604 ent->miss++; 656 - spin_unlock_irq(&ent->lock); 657 - mr = create_cache_mr(ent); 658 - if (IS_ERR(mr)) 659 - return mr; 605 + xa_unlock_irq(&ent->mkeys); 606 + err = create_cache_mkey(ent, &mr->mmkey.key); 607 + if (err) { 608 + xa_lock_irq(&ent->mkeys); 609 + ent->in_use--; 610 + xa_unlock_irq(&ent->mkeys); 611 + kfree(mr); 612 + return ERR_PTR(err); 613 + } 660 614 } else { 661 - mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list); 662 - list_del(&mr->list); 663 - ent->available_mrs--; 615 + mr->mmkey.key = pop_stored_mkey(ent); 664 616 queue_adjust_cache_locked(ent); 665 - spin_unlock_irq(&ent->lock); 666 - 667 - mlx5_clear_mr(mr); 617 + xa_unlock_irq(&ent->mkeys); 668 618 } 619 + mr->mmkey.cache_ent = ent; 620 + mr->mmkey.type = MLX5_MKEY_MR; 621 + init_waitqueue_head(&mr->mmkey.wait); 669 622 return mr; 670 - } 671 - 672 - static void mlx5_mr_cache_free(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) 673 - { 674 - struct mlx5_cache_ent *ent = mr->cache_ent; 675 - 676 - WRITE_ONCE(dev->cache.last_add, jiffies); 677 - spin_lock_irq(&ent->lock); 678 - list_add_tail(&mr->list, &ent->head); 679 - ent->available_mrs++; 680 - queue_adjust_cache_locked(ent); 681 - spin_unlock_irq(&ent->lock); 682 623 } 683 624 684 625 static void clean_keys(struct mlx5_ib_dev *dev, int c) 685 626 { 686 - struct mlx5_mr_cache *cache = &dev->cache; 627 + struct mlx5_mkey_cache *cache = &dev->cache; 687 628 struct mlx5_cache_ent *ent = &cache->ent[c]; 688 - struct mlx5_ib_mr *tmp_mr; 689 - struct mlx5_ib_mr *mr; 690 - LIST_HEAD(del_list); 629 + u32 mkey; 691 630 692 631 cancel_delayed_work(&ent->dwork); 693 - while (1) { 694 - spin_lock_irq(&ent->lock); 695 - if (list_empty(&ent->head)) { 696 - spin_unlock_irq(&ent->lock); 697 - break; 698 - } 699 - mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list); 700 - list_move(&mr->list, &del_list); 701 - ent->available_mrs--; 702 - ent->total_mrs--; 703 - spin_unlock_irq(&ent->lock); 704 - mlx5_core_destroy_mkey(dev->mdev, mr->mmkey.key); 632 + xa_lock_irq(&ent->mkeys); 633 + while (ent->stored) { 634 + mkey = pop_stored_mkey(ent); 635 + xa_unlock_irq(&ent->mkeys); 636 + mlx5_core_destroy_mkey(dev->mdev, mkey); 637 + xa_lock_irq(&ent->mkeys); 705 638 } 706 - 707 - list_for_each_entry_safe(mr, tmp_mr, &del_list, list) { 708 - list_del(&mr->list); 709 - kfree(mr); 710 - } 639 + xa_unlock_irq(&ent->mkeys); 711 640 } 712 641 713 - static void 
mlx5_mr_cache_debugfs_cleanup(struct mlx5_ib_dev *dev) 642 + static void mlx5_mkey_cache_debugfs_cleanup(struct mlx5_ib_dev *dev) 714 643 { 715 644 if (!mlx5_debugfs_root || dev->is_rep) 716 645 return; ··· 705 662 dev->cache.root = NULL; 706 663 } 707 664 708 - static void mlx5_mr_cache_debugfs_init(struct mlx5_ib_dev *dev) 665 + static void mlx5_mkey_cache_debugfs_init(struct mlx5_ib_dev *dev) 709 666 { 710 - struct mlx5_mr_cache *cache = &dev->cache; 667 + struct mlx5_mkey_cache *cache = &dev->cache; 711 668 struct mlx5_cache_ent *ent; 712 669 struct dentry *dir; 713 670 int i; ··· 717 674 718 675 cache->root = debugfs_create_dir("mr_cache", mlx5_debugfs_get_dev_root(dev->mdev)); 719 676 720 - for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) { 677 + for (i = 0; i < MAX_MKEY_CACHE_ENTRIES; i++) { 721 678 ent = &cache->ent[i]; 722 679 sprintf(ent->name, "%d", ent->order); 723 680 dir = debugfs_create_dir(ent->name, cache->root); 724 681 debugfs_create_file("size", 0600, dir, ent, &size_fops); 725 682 debugfs_create_file("limit", 0600, dir, ent, &limit_fops); 726 - debugfs_create_u32("cur", 0400, dir, &ent->available_mrs); 683 + debugfs_create_ulong("cur", 0400, dir, &ent->stored); 727 684 debugfs_create_u32("miss", 0600, dir, &ent->miss); 728 685 } 729 686 } ··· 735 692 WRITE_ONCE(dev->fill_delay, 0); 736 693 } 737 694 738 - int mlx5_mr_cache_init(struct mlx5_ib_dev *dev) 695 + int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev) 739 696 { 740 - struct mlx5_mr_cache *cache = &dev->cache; 697 + struct mlx5_mkey_cache *cache = &dev->cache; 741 698 struct mlx5_cache_ent *ent; 742 699 int i; 743 700 ··· 750 707 751 708 mlx5_cmd_init_async_ctx(dev->mdev, &dev->async_ctx); 752 709 timer_setup(&dev->delay_timer, delay_time_func, 0); 753 - for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) { 710 + for (i = 0; i < MAX_MKEY_CACHE_ENTRIES; i++) { 754 711 ent = &cache->ent[i]; 755 - INIT_LIST_HEAD(&ent->head); 756 - spin_lock_init(&ent->lock); 712 + xa_init_flags(&ent->mkeys, XA_FLAGS_LOCK_IRQ); 757 713 ent->order = i + 2; 758 714 ent->dev = dev; 759 715 ent->limit = 0; 760 716 761 717 INIT_DELAYED_WORK(&ent->dwork, delayed_cache_work_func); 762 718 763 - if (i > MR_CACHE_LAST_STD_ENTRY) { 764 - mlx5_odp_init_mr_cache_entry(ent); 719 + if (i > MKEY_CACHE_LAST_STD_ENTRY) { 720 + mlx5_odp_init_mkey_cache_entry(ent); 765 721 continue; 766 722 } 767 723 768 - if (ent->order > mr_cache_max_order(dev)) 724 + if (ent->order > mkey_cache_max_order(dev)) 769 725 continue; 770 726 771 727 ent->page = PAGE_SHIFT; ··· 776 734 ent->limit = dev->mdev->profile.mr_cache[i].limit; 777 735 else 778 736 ent->limit = 0; 779 - spin_lock_irq(&ent->lock); 737 + xa_lock_irq(&ent->mkeys); 780 738 queue_adjust_cache_locked(ent); 781 - spin_unlock_irq(&ent->lock); 739 + xa_unlock_irq(&ent->mkeys); 782 740 } 783 741 784 - mlx5_mr_cache_debugfs_init(dev); 742 + mlx5_mkey_cache_debugfs_init(dev); 785 743 786 744 return 0; 787 745 } 788 746 789 - int mlx5_mr_cache_cleanup(struct mlx5_ib_dev *dev) 747 + int mlx5_mkey_cache_cleanup(struct mlx5_ib_dev *dev) 790 748 { 791 749 unsigned int i; 792 750 793 751 if (!dev->cache.wq) 794 752 return 0; 795 753 796 - for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) { 754 + for (i = 0; i < MAX_MKEY_CACHE_ENTRIES; i++) { 797 755 struct mlx5_cache_ent *ent = &dev->cache.ent[i]; 798 756 799 - spin_lock_irq(&ent->lock); 757 + xa_lock_irq(&ent->mkeys); 800 758 ent->disabled = true; 801 - spin_unlock_irq(&ent->lock); 759 + xa_unlock_irq(&ent->mkeys); 802 760 cancel_delayed_work_sync(&ent->dwork); 803 761 } 804 762 805 - 
mlx5_mr_cache_debugfs_cleanup(dev); 763 + mlx5_mkey_cache_debugfs_cleanup(dev); 806 764 mlx5_cmd_cleanup_async_ctx(&dev->async_ctx); 807 765 808 - for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) 766 + for (i = 0; i < MAX_MKEY_CACHE_ENTRIES; i++) 809 767 clean_keys(dev, i); 810 768 811 769 destroy_workqueue(dev->cache.wq); ··· 872 830 return (npages + 1) / 2; 873 831 } 874 832 875 - static int mr_cache_max_order(struct mlx5_ib_dev *dev) 833 + static int mkey_cache_max_order(struct mlx5_ib_dev *dev) 876 834 { 877 835 if (MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset)) 878 - return MR_CACHE_LAST_STD_ENTRY + 2; 836 + return MKEY_CACHE_LAST_STD_ENTRY + 2; 879 837 return MLX5_MAX_UMR_SHIFT; 880 838 } 881 839 882 - static struct mlx5_cache_ent *mr_cache_ent_from_order(struct mlx5_ib_dev *dev, 883 - unsigned int order) 840 + static struct mlx5_cache_ent *mkey_cache_ent_from_order(struct mlx5_ib_dev *dev, 841 + unsigned int order) 884 842 { 885 - struct mlx5_mr_cache *cache = &dev->cache; 843 + struct mlx5_mkey_cache *cache = &dev->cache; 886 844 887 845 if (order < cache->ent[0].order) 888 846 return &cache->ent[0]; 889 847 order = order - cache->ent[0].order; 890 - if (order > MR_CACHE_LAST_STD_ENTRY) 848 + if (order > MKEY_CACHE_LAST_STD_ENTRY) 891 849 return NULL; 892 850 return &cache->ent[order]; 893 851 } ··· 930 888 0, iova); 931 889 if (WARN_ON(!page_size)) 932 890 return ERR_PTR(-EINVAL); 933 - ent = mr_cache_ent_from_order( 891 + ent = mkey_cache_ent_from_order( 934 892 dev, order_base_2(ib_umem_num_dma_blocks(umem, page_size))); 935 893 /* 936 894 * Matches access in alloc_cache_mr(). If the MR can't come from the ··· 1362 1320 struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device); 1363 1321 1364 1322 /* We only track the allocated sizes of MRs from the cache */ 1365 - if (!mr->cache_ent) 1323 + if (!mr->mmkey.cache_ent) 1366 1324 return false; 1367 1325 if (!mlx5r_umr_can_load_pas(dev, new_umem->length)) 1368 1326 return false; ··· 1371 1329 mlx5_umem_find_best_pgsz(new_umem, mkc, log_page_size, 0, iova); 1372 1330 if (WARN_ON(!*page_size)) 1373 1331 return false; 1374 - return (1ULL << mr->cache_ent->order) >= 1332 + return (1ULL << mr->mmkey.cache_ent->order) >= 1375 1333 ib_umem_num_dma_blocks(new_umem, *page_size); 1376 1334 } 1377 1335 ··· 1612 1570 } 1613 1571 1614 1572 /* Stop DMA */ 1615 - if (mr->cache_ent) { 1616 - if (mlx5r_umr_revoke_mr(mr)) { 1617 - spin_lock_irq(&mr->cache_ent->lock); 1618 - mr->cache_ent->total_mrs--; 1619 - spin_unlock_irq(&mr->cache_ent->lock); 1620 - mr->cache_ent = NULL; 1621 - } 1573 + if (mr->mmkey.cache_ent) { 1574 + xa_lock_irq(&mr->mmkey.cache_ent->mkeys); 1575 + mr->mmkey.cache_ent->in_use--; 1576 + xa_unlock_irq(&mr->mmkey.cache_ent->mkeys); 1577 + 1578 + if (mlx5r_umr_revoke_mr(mr) || 1579 + push_mkey(mr->mmkey.cache_ent, false, 1580 + xa_mk_value(mr->mmkey.key))) 1581 + mr->mmkey.cache_ent = NULL; 1622 1582 } 1623 - if (!mr->cache_ent) { 1583 + if (!mr->mmkey.cache_ent) { 1624 1584 rc = destroy_mkey(to_mdev(mr->ibmr.device), mr); 1625 1585 if (rc) 1626 1586 return rc; ··· 1639 1595 mlx5_ib_free_odp_mr(mr); 1640 1596 } 1641 1597 1642 - if (mr->cache_ent) { 1643 - mlx5_mr_cache_free(dev, mr); 1644 - } else { 1598 + if (!mr->mmkey.cache_ent) 1645 1599 mlx5_free_priv_descs(mr); 1646 - kfree(mr); 1647 - } 1600 + 1601 + kfree(mr); 1648 1602 return 0; 1649 1603 } 1650 1604
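The rewritten cache above keeps bare 32-bit mkeys in an xarray instead of a list of struct mlx5_ib_mr objects, storing each key as an xarray value entry. A minimal, stand-alone sketch of that encoding, with invented helper names (not the driver's push_mkey()/pop_stored_mkey()):

#include <linux/xarray.h>
#include <linux/types.h>

/* Sketch only: a u32 mkey round-trips through an xarray as a tagged value
 * entry, so no per-key allocation is needed. The xarray is assumed to have
 * been set up with xa_init() or DEFINE_XARRAY(). */
static int stash_mkey(struct xarray *xa, unsigned long index, u32 mkey)
{
        /* xa_mk_value() tags the integer so it is never mistaken for a pointer */
        return xa_err(xa_store(xa, index, xa_mk_value(mkey), GFP_KERNEL));
}

static u32 fetch_mkey(struct xarray *xa, unsigned long index)
{
        void *entry = xa_load(xa, index);

        return entry ? (u32)xa_to_value(entry) : 0;
}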
+1 -1
drivers/infiniband/hw/mlx5/odp.c
··· 1588 1588 return err; 1589 1589 } 1590 1590 1591 - void mlx5_odp_init_mr_cache_entry(struct mlx5_cache_ent *ent) 1591 + void mlx5_odp_init_mkey_cache_entry(struct mlx5_cache_ent *ent) 1592 1592 { 1593 1593 if (!(ent->dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT)) 1594 1594 return;
+68 -10
drivers/infiniband/hw/mlx5/umr.c
··· 176 176 dev->umrc.pd = pd; 177 177 178 178 sema_init(&dev->umrc.sem, MAX_UMR_WR); 179 + mutex_init(&dev->umrc.lock); 179 180 180 181 return 0; 181 182 ··· 194 193 ib_destroy_qp(dev->umrc.qp); 195 194 ib_free_cq(dev->umrc.cq); 196 195 ib_dealloc_pd(dev->umrc.pd); 196 + } 197 + 198 + static int mlx5r_umr_recover(struct mlx5_ib_dev *dev) 199 + { 200 + struct umr_common *umrc = &dev->umrc; 201 + struct ib_qp_attr attr; 202 + int err; 203 + 204 + attr.qp_state = IB_QPS_RESET; 205 + err = ib_modify_qp(umrc->qp, &attr, IB_QP_STATE); 206 + if (err) { 207 + mlx5_ib_dbg(dev, "Couldn't modify UMR QP\n"); 208 + goto err; 209 + } 210 + 211 + err = mlx5r_umr_qp_rst2rts(dev, umrc->qp); 212 + if (err) 213 + goto err; 214 + 215 + umrc->state = MLX5_UMR_STATE_ACTIVE; 216 + return 0; 217 + 218 + err: 219 + umrc->state = MLX5_UMR_STATE_ERR; 220 + return err; 197 221 } 198 222 199 223 static int mlx5r_umr_post_send(struct ib_qp *ibqp, u32 mkey, struct ib_cqe *cqe, ··· 257 231 258 232 id.ib_cqe = cqe; 259 233 mlx5r_finish_wqe(qp, ctrl, seg, size, cur_edge, idx, id.wr_id, 0, 260 - MLX5_FENCE_MODE_NONE, MLX5_OPCODE_UMR); 234 + MLX5_FENCE_MODE_INITIATOR_SMALL, MLX5_OPCODE_UMR); 261 235 262 236 mlx5r_ring_db(qp, 1, ctrl); 263 237 ··· 296 270 mlx5r_umr_init_context(&umr_context); 297 271 298 272 down(&umrc->sem); 299 - err = mlx5r_umr_post_send(umrc->qp, mkey, &umr_context.cqe, wqe, 300 - with_data); 301 - if (err) 302 - mlx5_ib_warn(dev, "UMR post send failed, err %d\n", err); 303 - else { 304 - wait_for_completion(&umr_context.done); 305 - if (umr_context.status != IB_WC_SUCCESS) { 306 - mlx5_ib_warn(dev, "reg umr failed (%u)\n", 307 - umr_context.status); 273 + while (true) { 274 + mutex_lock(&umrc->lock); 275 + if (umrc->state == MLX5_UMR_STATE_ERR) { 276 + mutex_unlock(&umrc->lock); 308 277 err = -EFAULT; 278 + break; 309 279 } 280 + 281 + if (umrc->state == MLX5_UMR_STATE_RECOVER) { 282 + mutex_unlock(&umrc->lock); 283 + usleep_range(3000, 5000); 284 + continue; 285 + } 286 + 287 + err = mlx5r_umr_post_send(umrc->qp, mkey, &umr_context.cqe, wqe, 288 + with_data); 289 + mutex_unlock(&umrc->lock); 290 + if (err) { 291 + mlx5_ib_warn(dev, "UMR post send failed, err %d\n", 292 + err); 293 + break; 294 + } 295 + 296 + wait_for_completion(&umr_context.done); 297 + 298 + if (umr_context.status == IB_WC_SUCCESS) 299 + break; 300 + 301 + if (umr_context.status == IB_WC_WR_FLUSH_ERR) 302 + continue; 303 + 304 + WARN_ON_ONCE(1); 305 + mlx5_ib_warn(dev, 306 + "reg umr failed (%u). Trying to recover and resubmit the flushed WQEs\n", 307 + umr_context.status); 308 + mutex_lock(&umrc->lock); 309 + err = mlx5r_umr_recover(dev); 310 + mutex_unlock(&umrc->lock); 311 + if (err) 312 + mlx5_ib_warn(dev, "couldn't recover UMR, err %d\n", 313 + err); 314 + err = -EFAULT; 315 + break; 310 316 } 311 317 up(&umrc->sem); 312 318 return err;
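The new submission loop above resubmits work requests whose completions came back flushed while the UMR QP was being reset and moved back to RTS. IB_WC_WR_FLUSH_ERR is the status a consumer sees for WQEs drained by a QP in the error state; a small, hedged example of separating that retryable case from a hard failure (hypothetical helper, not part of this series):

#include <rdma/ib_verbs.h>

/* Illustrative only: a flushed completion can be retried once the QP has
 * been recovered; any other error status is treated as final. */
static bool wc_is_retryable(const struct ib_wc *wc)
{
        if (wc->status == IB_WC_SUCCESS)
                return false;                /* nothing to retry */
        if (wc->status == IB_WC_WR_FLUSH_ERR)
                return true;                 /* drained while the QP was in error */
        pr_warn("wr failed: %s\n", ib_wc_status_msg(wc->status));
        return false;
}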
+5 -3
drivers/infiniband/hw/qedr/verbs.c
··· 3084 3084 else 3085 3085 DP_ERR(dev, "roce alloc tid returned error %d\n", rc); 3086 3086 3087 - goto err0; 3087 + goto err1; 3088 3088 } 3089 3089 3090 3090 /* Index only, 18 bit long, lkey = itid << 8 | key */ ··· 3108 3108 rc = dev->ops->rdma_register_tid(dev->rdma_ctx, &mr->hw_mr); 3109 3109 if (rc) { 3110 3110 DP_ERR(dev, "roce register tid returned an error %d\n", rc); 3111 - goto err1; 3111 + goto err2; 3112 3112 } 3113 3113 3114 3114 mr->ibmr.lkey = mr->hw_mr.itid << 8 | mr->hw_mr.key; ··· 3117 3117 DP_DEBUG(dev, QEDR_MSG_MR, "alloc frmr: %x\n", mr->ibmr.lkey); 3118 3118 return mr; 3119 3119 3120 - err1: 3120 + err2: 3121 3121 dev->ops->rdma_free_tid(dev->rdma_ctx, mr->hw_mr.itid); 3122 + err1: 3123 + qedr_free_pbl(dev, &mr->info.pbl_info, mr->info.pbl_table); 3122 3124 err0: 3123 3125 kfree(mr); 3124 3126 return ERR_PTR(rc);
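The qedr change above adds the missing qedr_free_pbl() on the failure path and renumbers the unwind labels so each label releases exactly what had been set up before the failing step. A generic sketch of that unwind ordering, with all names invented:

#include <linux/errno.h>

/* Stubs standing in for the real allocation steps of the sketch. */
static int alloc_pbl_stub(void) { return 0; }
static void free_pbl_stub(void) { }
static int alloc_tid_stub(void) { return -ENOMEM; }

static int alloc_frmr_sketch(void)
{
        int rc;

        rc = alloc_pbl_stub();
        if (rc)
                goto err0;

        rc = alloc_tid_stub();
        if (rc)
                goto err1;      /* the PBL from the first step must be freed */

        return 0;

err1:
        free_pbl_stub();
err0:
        return rc;
}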
+1 -1
drivers/infiniband/hw/qib/qib.h
··· 321 321 * These 7 values (SDR, DDR, and QDR may be ORed for auto-speed 322 322 * negotiation) are used for the 3rd argument to path_f_set_ib_cfg 323 323 * with cmd QIB_IB_CFG_SPD_ENB, by direct calls or via sysfs. They 324 - * are also the the possible values for qib_link_speed_enabled and active 324 + * are also the possible values for qib_link_speed_enabled and active 325 325 * The values were chosen to match values used within the IB spec. 326 326 */ 327 327 #define QIB_IB_SDR 1
+3 -3
drivers/infiniband/hw/qib/qib_file_ops.c
··· 153 153 kinfo->spi_tidcnt += dd->rcvtidcnt % subctxt_cnt; 154 154 /* 155 155 * for this use, may be cfgctxts summed over all chips that 156 - * are are configured and present 156 + * are configured and present 157 157 */ 158 158 kinfo->spi_nctxts = dd->cfgctxts; 159 159 /* unit (chip/board) our context is on */ ··· 851 851 ret = -EPERM; 852 852 goto bail; 853 853 } 854 - /* don't allow them to later change to writeable with mprotect */ 854 + /* don't allow them to later change to writable with mprotect */ 855 855 vma->vm_flags &= ~VM_MAYWRITE; 856 856 857 857 start = vma->vm_start; ··· 941 941 goto bail; 942 942 } 943 943 /* 944 - * Don't allow permission to later change to writeable 944 + * Don't allow permission to later change to writable 945 945 * with mprotect. 946 946 */ 947 947 vma->vm_flags &= ~VM_MAYWRITE;
+1 -1
drivers/infiniband/hw/qib/qib_iba7220.c
··· 58 58 /* 59 59 * This file contains almost all the chip-specific register information and 60 60 * access functions for the QLogic QLogic_IB 7220 PCI-Express chip, with the 61 - * exception of SerDes support, which in in qib_sd7220.c. 61 + * exception of SerDes support, which in qib_sd7220.c. 62 62 */ 63 63 64 64 /* Below uses machine-generated qib_chipnum_regs.h file */
+8 -15
drivers/infiniband/hw/qib/qib_iba7322.c
··· 2850 2850 2851 2851 qib_7322_free_irq(dd); 2852 2852 kfree(dd->cspec->cntrs); 2853 - kfree(dd->cspec->sendchkenable); 2854 - kfree(dd->cspec->sendgrhchk); 2855 - kfree(dd->cspec->sendibchk); 2853 + bitmap_free(dd->cspec->sendchkenable); 2854 + bitmap_free(dd->cspec->sendgrhchk); 2855 + bitmap_free(dd->cspec->sendibchk); 2856 2856 kfree(dd->cspec->msix_entries); 2857 2857 for (i = 0; i < dd->num_pports; i++) { 2858 2858 unsigned long flags; ··· 6383 6383 features = qib_7322_boardname(dd); 6384 6384 6385 6385 /* now that piobcnt2k and 4k set, we can allocate these */ 6386 - sbufcnt = dd->piobcnt2k + dd->piobcnt4k + 6387 - NUM_VL15_BUFS + BITS_PER_LONG - 1; 6388 - sbufcnt /= BITS_PER_LONG; 6389 - dd->cspec->sendchkenable = 6390 - kmalloc_array(sbufcnt, sizeof(*dd->cspec->sendchkenable), 6391 - GFP_KERNEL); 6392 - dd->cspec->sendgrhchk = 6393 - kmalloc_array(sbufcnt, sizeof(*dd->cspec->sendgrhchk), 6394 - GFP_KERNEL); 6395 - dd->cspec->sendibchk = 6396 - kmalloc_array(sbufcnt, sizeof(*dd->cspec->sendibchk), 6397 - GFP_KERNEL); 6386 + sbufcnt = dd->piobcnt2k + dd->piobcnt4k + NUM_VL15_BUFS; 6387 + 6388 + dd->cspec->sendchkenable = bitmap_zalloc(sbufcnt, GFP_KERNEL); 6389 + dd->cspec->sendgrhchk = bitmap_zalloc(sbufcnt, GFP_KERNEL); 6390 + dd->cspec->sendibchk = bitmap_zalloc(sbufcnt, GFP_KERNEL); 6398 6391 if (!dd->cspec->sendchkenable || !dd->cspec->sendgrhchk || 6399 6392 !dd->cspec->sendibchk) { 6400 6393 ret = -ENOMEM;
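The conversion above drops the open-coded allocation (kmalloc_array() of longs plus manual BITS_PER_LONG rounding) in favour of the bitmap API, which is sized directly in bits. A minimal sketch of the same pattern with invented names:

#include <linux/bitmap.h>
#include <linux/bitops.h>
#include <linux/gfp.h>

/* Sketch: allocate, use and free a zeroed bitmap sized in bits. */
static unsigned long *alloc_sendbuf_mask(unsigned int nbufs)
{
        unsigned long *mask = bitmap_zalloc(nbufs, GFP_KERNEL);

        if (mask)
                set_bit(0, mask);       /* e.g. mark buffer 0 as needing a check */
        return mask;
}

static void free_sendbuf_mask(unsigned long *mask)
{
        bitmap_free(mask);              /* NULL-safe, like kfree() */
}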
+2 -3
drivers/infiniband/hw/qib/qib_init.c
··· 1106 1106 if (!qib_cpulist_count) { 1107 1107 u32 count = num_online_cpus(); 1108 1108 1109 - qib_cpulist = kcalloc(BITS_TO_LONGS(count), sizeof(long), 1110 - GFP_KERNEL); 1109 + qib_cpulist = bitmap_zalloc(count, GFP_KERNEL); 1111 1110 if (qib_cpulist) 1112 1111 qib_cpulist_count = count; 1113 1112 } ··· 1278 1279 #endif 1279 1280 1280 1281 qib_cpulist_count = 0; 1281 - kfree(qib_cpulist); 1282 + bitmap_free(qib_cpulist); 1282 1283 1283 1284 WARN_ON(!xa_empty(&qib_dev_table)); 1284 1285 qib_dev_cleanup();
+1 -1
drivers/infiniband/hw/qib/qib_sd7220.c
··· 587 587 /* Need to release */ 588 588 u64 pollval; 589 589 /* 590 - * The only writeable bits are the request and CS. 590 + * The only writable bits are the request and CS. 591 591 * Both should be clear 592 592 */ 593 593 u64 newval = 0;
+1 -1
drivers/infiniband/hw/usnic/usnic_uiom.c
··· 482 482 if (err) 483 483 goto out_free_dev; 484 484 485 - if (!iommu_capable(dev->bus, IOMMU_CAP_CACHE_COHERENCY)) { 485 + if (!device_iommu_capable(dev, IOMMU_CAP_CACHE_COHERENCY)) { 486 486 usnic_err("IOMMU of %s does not support cache coherency\n", 487 487 dev_name(dev)); 488 488 err = -EINVAL;
+28 -21
drivers/infiniband/sw/rxe/rxe_comp.c
··· 114 114 { 115 115 struct rxe_qp *qp = from_timer(qp, t, retrans_timer); 116 116 117 + pr_debug("%s: fired for qp#%d\n", __func__, qp->elem.index); 118 + 117 119 if (qp->valid) { 118 120 qp->comp.timeout = 1; 119 121 rxe_run_task(&qp->comp.task, 1); ··· 562 560 struct sk_buff *skb = NULL; 563 561 struct rxe_pkt_info *pkt = NULL; 564 562 enum comp_state state; 565 - int ret = 0; 563 + int ret; 566 564 567 565 if (!rxe_get(qp)) 568 566 return -EAGAIN; 569 567 570 - if (!qp->valid || qp->req.state == QP_STATE_ERROR || 571 - qp->req.state == QP_STATE_RESET) { 568 + if (!qp->valid || qp->comp.state == QP_STATE_ERROR || 569 + qp->comp.state == QP_STATE_RESET) { 572 570 rxe_drain_resp_pkts(qp, qp->valid && 573 - qp->req.state == QP_STATE_ERROR); 574 - ret = -EAGAIN; 575 - goto done; 571 + qp->comp.state == QP_STATE_ERROR); 572 + goto exit; 576 573 } 577 574 578 575 if (qp->comp.timeout) { ··· 581 580 qp->comp.timeout_retry = 0; 582 581 } 583 582 584 - if (qp->req.need_retry) { 585 - ret = -EAGAIN; 586 - goto done; 587 - } 583 + if (qp->req.need_retry) 584 + goto exit; 588 585 589 586 state = COMPST_GET_ACK; 590 587 ··· 675 676 qp->qp_timeout_jiffies) 676 677 mod_timer(&qp->retrans_timer, 677 678 jiffies + qp->qp_timeout_jiffies); 678 - ret = -EAGAIN; 679 - goto done; 679 + goto exit; 680 680 681 681 case COMPST_ERROR_RETRY: 682 682 /* we come here if the retry timer fired and we did ··· 687 689 */ 688 690 689 691 /* there is nothing to retry in this case */ 690 - if (!wqe || (wqe->state == wqe_state_posted)) { 691 - ret = -EAGAIN; 692 - goto done; 693 - } 692 + if (!wqe || (wqe->state == wqe_state_posted)) 693 + goto exit; 694 694 695 695 /* if we've started a retry, don't start another 696 696 * retry sequence, unless this is a timeout. ··· 726 730 break; 727 731 728 732 case COMPST_RNR_RETRY: 733 + /* we come here if we received an RNR NAK */ 729 734 if (qp->comp.rnr_retry > 0) { 730 735 if (qp->comp.rnr_retry != 7) 731 736 qp->comp.rnr_retry--; 732 737 733 - qp->req.need_retry = 1; 738 + /* don't start a retry flow until the 739 + * rnr timer has fired 740 + */ 741 + qp->req.wait_for_rnr_timer = 1; 734 742 pr_debug("qp#%d set rnr nak timer\n", 735 743 qp_num(qp)); 736 744 mod_timer(&qp->rnr_nak_timer, 737 745 jiffies + rnrnak_jiffies(aeth_syn(pkt) 738 746 & ~AETH_TYPE_MASK)); 739 - ret = -EAGAIN; 740 - goto done; 747 + goto exit; 741 748 } else { 742 749 rxe_counter_inc(rxe, 743 750 RXE_CNT_RNR_RETRY_EXCEEDED); ··· 753 754 WARN_ON_ONCE(wqe->status == IB_WC_SUCCESS); 754 755 do_complete(qp, wqe); 755 756 rxe_qp_error(qp); 756 - ret = -EAGAIN; 757 - goto done; 757 + goto exit; 758 758 } 759 759 } 760 760 761 + /* A non-zero return value will cause rxe_do_task to 762 + * exit its loop and end the tasklet. A zero return 763 + * will continue looping and return to rxe_completer 764 + */ 761 765 done: 766 + ret = 0; 767 + goto out; 768 + exit: 769 + ret = -EAGAIN; 770 + out: 762 771 if (pkt) 763 772 free_pkt(pkt); 764 773 rxe_put(qp);
+4 -4
drivers/infiniband/sw/rxe/rxe_cq.c
··· 19 19 } 20 20 21 21 if (cqe > rxe->attr.max_cqe) { 22 - pr_warn("cqe(%d) > max_cqe(%d)\n", 23 - cqe, rxe->attr.max_cqe); 22 + pr_debug("cqe(%d) > max_cqe(%d)\n", 23 + cqe, rxe->attr.max_cqe); 24 24 goto err1; 25 25 } 26 26 27 27 if (cq) { 28 28 count = queue_count(cq->queue, QUEUE_TYPE_TO_CLIENT); 29 29 if (cqe < count) { 30 - pr_warn("cqe(%d) < current # elements in queue (%d)", 31 - cqe, count); 30 + pr_debug("cqe(%d) < current # elements in queue (%d)", 31 + cqe, count); 32 32 goto err1; 33 33 } 34 34 }
+2 -3
drivers/infiniband/sw/rxe/rxe_loc.h
··· 77 77 enum rxe_mr_lookup_type type); 78 78 int mr_check_range(struct rxe_mr *mr, u64 iova, size_t length); 79 79 int advance_dma_data(struct rxe_dma_info *dma, unsigned int length); 80 - int rxe_invalidate_mr(struct rxe_qp *qp, u32 rkey); 80 + int rxe_invalidate_mr(struct rxe_qp *qp, u32 key); 81 81 int rxe_reg_fast_mr(struct rxe_qp *qp, struct rxe_send_wqe *wqe); 82 - int rxe_mr_set_page(struct ib_mr *ibmr, u64 addr); 83 82 int rxe_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata); 84 83 void rxe_mr_cleanup(struct rxe_pool_elem *elem); 85 84 ··· 144 145 max_sge * sizeof(struct ib_sge); 145 146 } 146 147 147 - void free_rd_atomic_resource(struct rxe_qp *qp, struct resp_res *res); 148 + void free_rd_atomic_resource(struct resp_res *res); 148 149 149 150 static inline void rxe_advance_resp_resource(struct rxe_qp *qp) 150 151 {
+73 -140
drivers/infiniband/sw/rxe/rxe_mr.c
··· 24 24 25 25 int mr_check_range(struct rxe_mr *mr, u64 iova, size_t length) 26 26 { 27 - struct rxe_map_set *set = mr->cur_map_set; 27 + 28 28 29 29 switch (mr->type) { 30 30 case IB_MR_TYPE_DMA: ··· 32 32 33 33 case IB_MR_TYPE_USER: 34 34 case IB_MR_TYPE_MEM_REG: 35 - if (iova < set->iova || length > set->length || 36 - iova > set->iova + set->length - length) 35 + if (iova < mr->iova || length > mr->length || 36 + iova > mr->iova + mr->length - length) 37 37 return -EFAULT; 38 38 return 0; 39 39 ··· 65 65 mr->map_shift = ilog2(RXE_BUF_PER_MAP); 66 66 } 67 67 68 - static void rxe_mr_free_map_set(int num_map, struct rxe_map_set *set) 68 + static int rxe_mr_alloc(struct rxe_mr *mr, int num_buf) 69 69 { 70 70 int i; 71 + int num_map; 72 + struct rxe_map **map = mr->map; 71 73 72 - for (i = 0; i < num_map; i++) 73 - kfree(set->map[i]); 74 + num_map = (num_buf + RXE_BUF_PER_MAP - 1) / RXE_BUF_PER_MAP; 74 75 75 - kfree(set->map); 76 - kfree(set); 77 - } 78 - 79 - static int rxe_mr_alloc_map_set(int num_map, struct rxe_map_set **setp) 80 - { 81 - int i; 82 - struct rxe_map_set *set; 83 - 84 - set = kmalloc(sizeof(*set), GFP_KERNEL); 85 - if (!set) 86 - goto err_out; 87 - 88 - set->map = kmalloc_array(num_map, sizeof(struct rxe_map *), GFP_KERNEL); 89 - if (!set->map) 90 - goto err_free_set; 76 + mr->map = kmalloc_array(num_map, sizeof(*map), GFP_KERNEL); 77 + if (!mr->map) 78 + goto err1; 91 79 92 80 for (i = 0; i < num_map; i++) { 93 - set->map[i] = kmalloc(sizeof(struct rxe_map), GFP_KERNEL); 94 - if (!set->map[i]) 95 - goto err_free_map; 81 + mr->map[i] = kmalloc(sizeof(**map), GFP_KERNEL); 82 + if (!mr->map[i]) 83 + goto err2; 96 84 } 97 85 98 - *setp = set; 99 - 100 - return 0; 101 - 102 - err_free_map: 103 - for (i--; i >= 0; i--) 104 - kfree(set->map[i]); 105 - 106 - kfree(set->map); 107 - err_free_set: 108 - kfree(set); 109 - err_out: 110 - return -ENOMEM; 111 - } 112 - 113 - /** 114 - * rxe_mr_alloc() - Allocate memory map array(s) for MR 115 - * @mr: Memory region 116 - * @num_buf: Number of buffer descriptors to support 117 - * @both: If non zero allocate both mr->map and mr->next_map 118 - * else just allocate mr->map. 
Used for fast MRs 119 - * 120 - * Return: 0 on success else an error 121 - */ 122 - static int rxe_mr_alloc(struct rxe_mr *mr, int num_buf, int both) 123 - { 124 - int ret; 125 - int num_map; 126 - 127 86 BUILD_BUG_ON(!is_power_of_2(RXE_BUF_PER_MAP)); 128 - num_map = (num_buf + RXE_BUF_PER_MAP - 1) / RXE_BUF_PER_MAP; 129 87 130 88 mr->map_shift = ilog2(RXE_BUF_PER_MAP); 131 89 mr->map_mask = RXE_BUF_PER_MAP - 1; 90 + 132 91 mr->num_buf = num_buf; 133 - mr->max_buf = num_map * RXE_BUF_PER_MAP; 134 92 mr->num_map = num_map; 135 - 136 - ret = rxe_mr_alloc_map_set(num_map, &mr->cur_map_set); 137 - if (ret) 138 - return -ENOMEM; 139 - 140 - if (both) { 141 - ret = rxe_mr_alloc_map_set(num_map, &mr->next_map_set); 142 - if (ret) 143 - goto err_free; 144 - } 93 + mr->max_buf = num_map * RXE_BUF_PER_MAP; 145 94 146 95 return 0; 147 96 148 - err_free: 149 - rxe_mr_free_map_set(mr->num_map, mr->cur_map_set); 150 - mr->cur_map_set = NULL; 97 + err2: 98 + for (i--; i >= 0; i--) 99 + kfree(mr->map[i]); 100 + 101 + kfree(mr->map); 102 + err1: 151 103 return -ENOMEM; 152 104 } 153 105 ··· 116 164 int rxe_mr_init_user(struct rxe_pd *pd, u64 start, u64 length, u64 iova, 117 165 int access, struct rxe_mr *mr) 118 166 { 119 - struct rxe_map_set *set; 120 167 struct rxe_map **map; 121 168 struct rxe_phys_buf *buf = NULL; 122 169 struct ib_umem *umem; ··· 123 172 int num_buf; 124 173 void *vaddr; 125 174 int err; 175 + int i; 126 176 127 177 umem = ib_umem_get(pd->ibpd.device, start, length, access); 128 178 if (IS_ERR(umem)) { ··· 137 185 138 186 rxe_mr_init(access, mr); 139 187 140 - err = rxe_mr_alloc(mr, num_buf, 0); 188 + err = rxe_mr_alloc(mr, num_buf); 141 189 if (err) { 142 190 pr_warn("%s: Unable to allocate memory for map\n", 143 191 __func__); 144 192 goto err_release_umem; 145 193 } 146 194 147 - set = mr->cur_map_set; 148 - set->page_shift = PAGE_SHIFT; 149 - set->page_mask = PAGE_SIZE - 1; 195 + mr->page_shift = PAGE_SHIFT; 196 + mr->page_mask = PAGE_SIZE - 1; 150 197 151 - num_buf = 0; 152 - map = set->map; 153 - 198 + num_buf = 0; 199 + map = mr->map; 154 200 if (length > 0) { 155 201 buf = map[0]->buf; 156 202 ··· 164 214 pr_warn("%s: Unable to get virtual address\n", 165 215 __func__); 166 216 err = -ENOMEM; 167 - goto err_release_umem; 217 + goto err_cleanup_map; 168 218 } 169 219 170 220 buf->addr = (uintptr_t)vaddr; 171 221 buf->size = PAGE_SIZE; 172 222 num_buf++; 173 223 buf++; 224 + 174 225 } 175 226 } 176 227 177 228 mr->ibmr.pd = &pd->ibpd; 178 229 mr->umem = umem; 179 230 mr->access = access; 231 + mr->length = length; 232 + mr->iova = iova; 233 + mr->va = start; 234 + mr->offset = ib_umem_offset(umem); 180 235 mr->state = RXE_MR_STATE_VALID; 181 236 mr->type = IB_MR_TYPE_USER; 182 237 183 - set->length = length; 184 - set->iova = iova; 185 - set->va = start; 186 - set->offset = ib_umem_offset(umem); 187 - 188 238 return 0; 189 239 240 + err_cleanup_map: 241 + for (i = 0; i < mr->num_map; i++) 242 + kfree(mr->map[i]); 243 + kfree(mr->map); 190 244 err_release_umem: 191 245 ib_umem_release(umem); 192 246 err_out: ··· 204 250 /* always allow remote access for FMRs */ 205 251 rxe_mr_init(IB_ACCESS_REMOTE, mr); 206 252 207 - err = rxe_mr_alloc(mr, max_pages, 1); 253 + err = rxe_mr_alloc(mr, max_pages); 208 254 if (err) 209 255 goto err1; 210 256 ··· 222 268 static void lookup_iova(struct rxe_mr *mr, u64 iova, int *m_out, int *n_out, 223 269 size_t *offset_out) 224 270 { 225 - struct rxe_map_set *set = mr->cur_map_set; 226 - size_t offset = iova - set->iova + set->offset; 271 + size_t 
offset = iova - mr->iova + mr->offset; 227 272 int map_index; 228 273 int buf_index; 229 274 u64 length; 230 - struct rxe_map *map; 231 275 232 - if (likely(set->page_shift)) { 233 - *offset_out = offset & set->page_mask; 234 - offset >>= set->page_shift; 276 + if (likely(mr->page_shift)) { 277 + *offset_out = offset & mr->page_mask; 278 + offset >>= mr->page_shift; 235 279 *n_out = offset & mr->map_mask; 236 280 *m_out = offset >> mr->map_shift; 237 281 } else { 238 282 map_index = 0; 239 283 buf_index = 0; 240 284 241 - map = set->map[map_index]; 242 - length = map->buf[buf_index].size; 285 + length = mr->map[map_index]->buf[buf_index].size; 243 286 244 287 while (offset >= length) { 245 288 offset -= length; ··· 246 295 map_index++; 247 296 buf_index = 0; 248 297 } 249 - map = set->map[map_index]; 250 - length = map->buf[buf_index].size; 298 + length = mr->map[map_index]->buf[buf_index].size; 251 299 } 252 300 253 301 *m_out = map_index; ··· 267 317 goto out; 268 318 } 269 319 270 - if (!mr->cur_map_set) { 320 + if (!mr->map) { 271 321 addr = (void *)(uintptr_t)iova; 272 322 goto out; 273 323 } ··· 280 330 281 331 lookup_iova(mr, iova, &m, &n, &offset); 282 332 283 - if (offset + length > mr->cur_map_set->map[m]->buf[n].size) { 333 + if (offset + length > mr->map[m]->buf[n].size) { 284 334 pr_warn("crosses page boundary\n"); 285 335 addr = NULL; 286 336 goto out; 287 337 } 288 338 289 - addr = (void *)(uintptr_t)mr->cur_map_set->map[m]->buf[n].addr + offset; 339 + addr = (void *)(uintptr_t)mr->map[m]->buf[n].addr + offset; 290 340 291 341 out: 292 342 return addr; ··· 322 372 return 0; 323 373 } 324 374 325 - WARN_ON_ONCE(!mr->cur_map_set); 375 + WARN_ON_ONCE(!mr->map); 326 376 327 377 err = mr_check_range(mr, iova, length); 328 378 if (err) { ··· 332 382 333 383 lookup_iova(mr, iova, &m, &i, &offset); 334 384 335 - map = mr->cur_map_set->map + m; 385 + map = mr->map + m; 336 386 buf = map[0]->buf + i; 337 387 338 388 while (length > 0) { ··· 526 576 return mr; 527 577 } 528 578 529 - int rxe_invalidate_mr(struct rxe_qp *qp, u32 rkey) 579 + int rxe_invalidate_mr(struct rxe_qp *qp, u32 key) 530 580 { 531 581 struct rxe_dev *rxe = to_rdev(qp->ibqp.device); 532 582 struct rxe_mr *mr; 533 583 int ret; 534 584 535 - mr = rxe_pool_get_index(&rxe->mr_pool, rkey >> 8); 585 + mr = rxe_pool_get_index(&rxe->mr_pool, key >> 8); 536 586 if (!mr) { 537 - pr_err("%s: No MR for rkey %#x\n", __func__, rkey); 587 + pr_err("%s: No MR for key %#x\n", __func__, key); 538 588 ret = -EINVAL; 539 589 goto err; 540 590 } 541 591 542 - if (rkey != mr->rkey) { 543 - pr_err("%s: rkey (%#x) doesn't match mr->rkey (%#x)\n", 544 - __func__, rkey, mr->rkey); 592 + if (mr->rkey ? (key != mr->rkey) : (key != mr->lkey)) { 593 + pr_err("%s: wr key (%#x) doesn't match mr key (%#x)\n", 594 + __func__, key, (mr->rkey ? 
mr->rkey : mr->lkey)); 545 595 ret = -EINVAL; 546 596 goto err_drop_ref; 547 597 } ··· 578 628 int rxe_reg_fast_mr(struct rxe_qp *qp, struct rxe_send_wqe *wqe) 579 629 { 580 630 struct rxe_mr *mr = to_rmr(wqe->wr.wr.reg.mr); 581 - u32 key = wqe->wr.wr.reg.key & 0xff; 631 + u32 key = wqe->wr.wr.reg.key; 582 632 u32 access = wqe->wr.wr.reg.access; 583 - struct rxe_map_set *set; 584 633 585 634 /* user can only register MR in free state */ 586 635 if (unlikely(mr->state != RXE_MR_STATE_FREE)) { ··· 595 646 return -EINVAL; 596 647 } 597 648 649 + /* user is only allowed to change key portion of l/rkey */ 650 + if (unlikely((mr->lkey & ~0xff) != (key & ~0xff))) { 651 + pr_warn("%s: key = 0x%x has wrong index mr->lkey = 0x%x\n", 652 + __func__, key, mr->lkey); 653 + return -EINVAL; 654 + } 655 + 598 656 mr->access = access; 599 - mr->lkey = (mr->lkey & ~0xff) | key; 600 - mr->rkey = (access & IB_ACCESS_REMOTE) ? mr->lkey : 0; 657 + mr->lkey = key; 658 + mr->rkey = (access & IB_ACCESS_REMOTE) ? key : 0; 659 + mr->iova = wqe->wr.wr.reg.mr->iova; 601 660 mr->state = RXE_MR_STATE_VALID; 602 - 603 - set = mr->cur_map_set; 604 - mr->cur_map_set = mr->next_map_set; 605 - mr->cur_map_set->iova = wqe->wr.wr.reg.mr->iova; 606 - mr->next_map_set = set; 607 - 608 - return 0; 609 - } 610 - 611 - int rxe_mr_set_page(struct ib_mr *ibmr, u64 addr) 612 - { 613 - struct rxe_mr *mr = to_rmr(ibmr); 614 - struct rxe_map_set *set = mr->next_map_set; 615 - struct rxe_map *map; 616 - struct rxe_phys_buf *buf; 617 - 618 - if (unlikely(set->nbuf == mr->num_buf)) 619 - return -ENOMEM; 620 - 621 - map = set->map[set->nbuf / RXE_BUF_PER_MAP]; 622 - buf = &map->buf[set->nbuf % RXE_BUF_PER_MAP]; 623 - 624 - buf->addr = addr; 625 - buf->size = ibmr->page_size; 626 - set->nbuf++; 627 661 628 662 return 0; 629 663 } ··· 619 687 if (atomic_read(&mr->num_mw) > 0) 620 688 return -EINVAL; 621 689 622 - rxe_put(mr); 690 + rxe_cleanup(mr); 623 691 624 692 return 0; 625 693 } ··· 627 695 void rxe_mr_cleanup(struct rxe_pool_elem *elem) 628 696 { 629 697 struct rxe_mr *mr = container_of(elem, typeof(*mr), elem); 698 + int i; 630 699 631 700 rxe_put(mr_pd(mr)); 632 - 633 701 ib_umem_release(mr->umem); 634 702 635 - if (mr->cur_map_set) 636 - rxe_mr_free_map_set(mr->num_map, mr->cur_map_set); 703 + if (mr->map) { 704 + for (i = 0; i < mr->num_map; i++) 705 + kfree(mr->map[i]); 637 706 638 - if (mr->next_map_set) 639 - rxe_mr_free_map_set(mr->num_map, mr->next_map_set); 707 + kfree(mr->map); 708 + } 640 709 }
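The key checks above follow rxe's layout of an lkey/rkey: the upper 24 bits carry the MR's pool index (hence the key >> 8 lookup) while the low byte is the only part rxe_reg_fast_mr() allows the consumer to change. Two trivial helpers that make the split explicit (illustrative only, not part of the patch):

#include <linux/types.h>

/* Illustrative: how a 32-bit l/rkey splits into pool index + key byte. */
static inline u32 rxe_key_to_index(u32 key)
{
        return key >> 8;                /* index used for the pool lookup */
}

static inline u32 rxe_key_low_byte(u32 key)
{
        return key & 0xff;              /* only this byte may differ from mr->lkey */
}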
+7 -12
drivers/infiniband/sw/rxe/rxe_mw.c
··· 33 33 RXE_MW_STATE_FREE : RXE_MW_STATE_VALID; 34 34 spin_lock_init(&mw->lock); 35 35 36 + rxe_finalize(mw); 37 + 36 38 return 0; 37 39 } 38 40 ··· 42 40 { 43 41 struct rxe_mw *mw = to_rmw(ibmw); 44 42 45 - rxe_put(mw); 43 + rxe_cleanup(mw); 46 44 47 45 return 0; 48 46 } ··· 50 48 static int rxe_check_bind_mw(struct rxe_qp *qp, struct rxe_send_wqe *wqe, 51 49 struct rxe_mw *mw, struct rxe_mr *mr) 52 50 { 53 - u32 key = wqe->wr.wr.mw.rkey & 0xff; 54 - 55 51 if (mw->ibmw.type == IB_MW_TYPE_1) { 56 52 if (unlikely(mw->state != RXE_MW_STATE_VALID)) { 57 53 pr_err_once( ··· 87 87 } 88 88 } 89 89 90 - if (unlikely(key == (mw->rkey & 0xff))) { 91 - pr_err_once("attempt to bind MW with same key\n"); 92 - return -EINVAL; 93 - } 94 - 95 90 /* remaining checks only apply to a nonzero MR */ 96 91 if (!mr) 97 92 return 0; ··· 108 113 (IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_ATOMIC)) && 109 114 !(mr->access & IB_ACCESS_LOCAL_WRITE))) { 110 115 pr_err_once( 111 - "attempt to bind an writeable MW to an MR without local write access\n"); 116 + "attempt to bind an Writable MW to an MR without local write access\n"); 112 117 return -EINVAL; 113 118 } 114 119 115 120 /* C10-75 */ 116 121 if (mw->access & IB_ZERO_BASED) { 117 - if (unlikely(wqe->wr.wr.mw.length > mr->cur_map_set->length)) { 122 + if (unlikely(wqe->wr.wr.mw.length > mr->length)) { 118 123 pr_err_once( 119 124 "attempt to bind a ZB MW outside of the MR\n"); 120 125 return -EINVAL; 121 126 } 122 127 } else { 123 - if (unlikely((wqe->wr.wr.mw.addr < mr->cur_map_set->iova) || 128 + if (unlikely((wqe->wr.wr.mw.addr < mr->iova) || 124 129 ((wqe->wr.wr.mw.addr + wqe->wr.wr.mw.length) > 125 - (mr->cur_map_set->iova + mr->cur_map_set->length)))) { 130 + (mr->iova + mr->length)))) { 126 131 pr_err_once( 127 132 "attempt to bind a VA MW outside of the MR\n"); 128 133 return -EINVAL;
+6
drivers/infiniband/sw/rxe/rxe_param.h
··· 105 105 RXE_INFLIGHT_SKBS_PER_QP_HIGH = 64, 106 106 RXE_INFLIGHT_SKBS_PER_QP_LOW = 16, 107 107 108 + /* Max number of interations of each tasklet 109 + * before yielding the cpu to let other 110 + * work make progress 111 + */ 112 + RXE_MAX_ITERATIONS = 1024, 113 + 108 114 /* Delay before calling arbiter timer */ 109 115 RXE_NSEC_ARB_TIMER_DELAY = 200, 110 116
+92 -12
drivers/infiniband/sw/rxe/rxe_pool.c
··· 6 6 7 7 #include "rxe.h" 8 8 9 + #define RXE_POOL_TIMEOUT (200) 9 10 #define RXE_POOL_ALIGN (16) 10 11 11 12 static const struct rxe_type_info { ··· 137 136 elem->pool = pool; 138 137 elem->obj = obj; 139 138 kref_init(&elem->ref_cnt); 139 + init_completion(&elem->complete); 140 140 141 - err = xa_alloc_cyclic(&pool->xa, &elem->index, elem, pool->limit, 141 + /* allocate index in array but leave pointer as NULL so it 142 + * can't be looked up until rxe_finalize() is called 143 + */ 144 + err = xa_alloc_cyclic(&pool->xa, &elem->index, NULL, pool->limit, 142 145 &pool->next, GFP_KERNEL); 143 - if (err) 146 + if (err < 0) 144 147 goto err_free; 145 148 146 149 return obj; ··· 156 151 return NULL; 157 152 } 158 153 159 - int __rxe_add_to_pool(struct rxe_pool *pool, struct rxe_pool_elem *elem) 154 + int __rxe_add_to_pool(struct rxe_pool *pool, struct rxe_pool_elem *elem, 155 + bool sleepable) 160 156 { 161 157 int err; 158 + gfp_t gfp_flags; 162 159 163 160 if (WARN_ON(pool->type == RXE_TYPE_MR)) 164 161 return -EINVAL; ··· 171 164 elem->pool = pool; 172 165 elem->obj = (u8 *)elem - pool->elem_offset; 173 166 kref_init(&elem->ref_cnt); 167 + init_completion(&elem->complete); 174 168 175 - err = xa_alloc_cyclic(&pool->xa, &elem->index, elem, pool->limit, 176 - &pool->next, GFP_KERNEL); 177 - if (err) 169 + /* AH objects are unique in that the create_ah verb 170 + * can be called in atomic context. If the create_ah 171 + * call is not sleepable use GFP_ATOMIC. 172 + */ 173 + gfp_flags = sleepable ? GFP_KERNEL : GFP_ATOMIC; 174 + 175 + if (sleepable) 176 + might_sleep(); 177 + err = xa_alloc_cyclic(&pool->xa, &elem->index, NULL, pool->limit, 178 + &pool->next, gfp_flags); 179 + if (err < 0) 178 180 goto err_cnt; 179 181 180 182 return 0; ··· 197 181 { 198 182 struct rxe_pool_elem *elem; 199 183 struct xarray *xa = &pool->xa; 200 - unsigned long flags; 201 184 void *obj; 202 185 203 - xa_lock_irqsave(xa, flags); 186 + rcu_read_lock(); 204 187 elem = xa_load(xa, index); 205 188 if (elem && kref_get_unless_zero(&elem->ref_cnt)) 206 189 obj = elem->obj; 207 190 else 208 191 obj = NULL; 209 - xa_unlock_irqrestore(xa, flags); 192 + rcu_read_unlock(); 210 193 211 194 return obj; 212 195 } ··· 213 198 static void rxe_elem_release(struct kref *kref) 214 199 { 215 200 struct rxe_pool_elem *elem = container_of(kref, typeof(*elem), ref_cnt); 216 - struct rxe_pool *pool = elem->pool; 217 201 218 - xa_erase(&pool->xa, elem->index); 202 + complete(&elem->complete); 203 + } 204 + 205 + int __rxe_cleanup(struct rxe_pool_elem *elem, bool sleepable) 206 + { 207 + struct rxe_pool *pool = elem->pool; 208 + struct xarray *xa = &pool->xa; 209 + static int timeout = RXE_POOL_TIMEOUT; 210 + int ret, err = 0; 211 + void *xa_ret; 212 + 213 + if (sleepable) 214 + might_sleep(); 215 + 216 + /* erase xarray entry to prevent looking up 217 + * the pool elem from its index 218 + */ 219 + xa_ret = xa_erase(xa, elem->index); 220 + WARN_ON(xa_err(xa_ret)); 221 + 222 + /* if this is the last call to rxe_put complete the 223 + * object. It is safe to touch obj->elem after this since 224 + * it is freed below 225 + */ 226 + __rxe_put(elem); 227 + 228 + /* wait until all references to the object have been 229 + * dropped before final object specific cleanup and 230 + * return to rdma-core 231 + */ 232 + if (sleepable) { 233 + if (!completion_done(&elem->complete) && timeout) { 234 + ret = wait_for_completion_timeout(&elem->complete, 235 + timeout); 236 + 237 + /* Shouldn't happen. 
There are still references to 238 + * the object but, rather than deadlock, free the 239 + * object or pass back to rdma-core. 240 + */ 241 + if (WARN_ON(!ret)) 242 + err = -EINVAL; 243 + } 244 + } else { 245 + unsigned long until = jiffies + timeout; 246 + 247 + /* AH objects are unique in that the destroy_ah verb 248 + * can be called in atomic context. This delay 249 + * replaces the wait_for_completion call above 250 + * when the destroy_ah call is not sleepable 251 + */ 252 + while (!completion_done(&elem->complete) && 253 + time_before(jiffies, until)) 254 + mdelay(1); 255 + 256 + if (WARN_ON(!completion_done(&elem->complete))) 257 + err = -EINVAL; 258 + } 219 259 220 260 if (pool->cleanup) 221 261 pool->cleanup(elem); 222 262 223 263 if (pool->type == RXE_TYPE_MR) 224 - kfree(elem->obj); 264 + kfree_rcu(elem->obj); 225 265 226 266 atomic_dec(&pool->num_elem); 267 + 268 + return err; 227 269 } 228 270 229 271 int __rxe_get(struct rxe_pool_elem *elem) ··· 291 219 int __rxe_put(struct rxe_pool_elem *elem) 292 220 { 293 221 return kref_put(&elem->ref_cnt, rxe_elem_release); 222 + } 223 + 224 + void __rxe_finalize(struct rxe_pool_elem *elem) 225 + { 226 + void *xa_ret; 227 + 228 + xa_ret = xa_store(&elem->pool->xa, elem->index, elem, GFP_KERNEL); 229 + WARN_ON(xa_err(xa_ret)); 294 230 }
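The reworked rxe_pool.c teardown above replaces "free the object from the last kref_put()" with "the destroy path drops its own reference and then waits on a completion until every other holder has dropped theirs". A rough userspace analogue of that ordering, using a pthread condition variable in place of the kernel completion (illustrative only, not driver code):

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

struct obj {
        int refcnt;                     /* stands in for the kref       */
        int complete;                   /* stands in for the completion */
        pthread_mutex_t lock;
        pthread_cond_t cond;
};

static void obj_put(struct obj *o)
{
        pthread_mutex_lock(&o->lock);
        if (--o->refcnt == 0) {         /* last put signals completion  */
                o->complete = 1;
                pthread_cond_broadcast(&o->cond);
        }
        pthread_mutex_unlock(&o->lock);
}

static void *user_thread(void *arg)
{
        struct obj *o = arg;            /* this thread holds a reference */

        usleep(10000);                  /* pretend to use the object     */
        obj_put(o);
        return NULL;
}

int main(void)
{
        struct obj o = { .refcnt = 2 }; /* initial ref + one user ref    */
        pthread_t t;

        pthread_mutex_init(&o.lock, NULL);
        pthread_cond_init(&o.cond, NULL);
        pthread_create(&t, NULL, user_thread, &o);

        obj_put(&o);                    /* destroy path drops initial ref */

        pthread_mutex_lock(&o.lock);    /* wait_for_completion()          */
        while (!o.complete)
                pthread_cond_wait(&o.cond, &o.lock);
        pthread_mutex_unlock(&o.lock);

        pthread_join(t, NULL);
        puts("all references dropped; safe to free the object");
        return 0;
}

The non-sleepable AH path in the patch polls with mdelay() instead of blocking, but the ordering guarantee is the same: object-specific cleanup and the final free only run once all references are gone.
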
+13 -5
drivers/infiniband/sw/rxe/rxe_pool.h
··· 24 24 void *obj; 25 25 struct kref ref_cnt; 26 26 struct list_head list; 27 + struct completion complete; 27 28 u32 index; 28 29 }; 29 30 ··· 58 57 void *rxe_alloc(struct rxe_pool *pool); 59 58 60 59 /* connect already allocated object to pool */ 61 - int __rxe_add_to_pool(struct rxe_pool *pool, struct rxe_pool_elem *elem); 62 - 63 - #define rxe_add_to_pool(pool, obj) __rxe_add_to_pool(pool, &(obj)->elem) 60 + int __rxe_add_to_pool(struct rxe_pool *pool, struct rxe_pool_elem *elem, 61 + bool sleepable); 62 + #define rxe_add_to_pool(pool, obj) __rxe_add_to_pool(pool, &(obj)->elem, true) 63 + #define rxe_add_to_pool_ah(pool, obj, sleepable) __rxe_add_to_pool(pool, \ 64 + &(obj)->elem, sleepable) 64 65 65 66 /* lookup an indexed object from index. takes a reference on object */ 66 67 void *rxe_pool_get_index(struct rxe_pool *pool, u32 index); 67 68 68 69 int __rxe_get(struct rxe_pool_elem *elem); 69 - 70 70 #define rxe_get(obj) __rxe_get(&(obj)->elem) 71 71 72 72 int __rxe_put(struct rxe_pool_elem *elem); 73 - 74 73 #define rxe_put(obj) __rxe_put(&(obj)->elem) 75 74 75 + int __rxe_cleanup(struct rxe_pool_elem *elem, bool sleepable); 76 + #define rxe_cleanup(obj) __rxe_cleanup(&(obj)->elem, true) 77 + #define rxe_cleanup_ah(obj, sleepable) __rxe_cleanup(&(obj)->elem, sleepable) 78 + 76 79 #define rxe_read(obj) kref_read(&(obj)->elem.ref_cnt) 80 + 81 + void __rxe_finalize(struct rxe_pool_elem *elem); 82 + #define rxe_finalize(obj) __rxe_finalize(&(obj)->elem) 77 83 78 84 #endif /* RXE_POOL_H */
+23 -13
drivers/infiniband/sw/rxe/rxe_qp.c
··· 120 120 for (i = 0; i < qp->attr.max_dest_rd_atomic; i++) { 121 121 struct resp_res *res = &qp->resp.resources[i]; 122 122 123 - free_rd_atomic_resource(qp, res); 123 + free_rd_atomic_resource(res); 124 124 } 125 125 kfree(qp->resp.resources); 126 126 qp->resp.resources = NULL; 127 127 } 128 128 } 129 129 130 - void free_rd_atomic_resource(struct rxe_qp *qp, struct resp_res *res) 130 + void free_rd_atomic_resource(struct resp_res *res) 131 131 { 132 - if (res->type == RXE_ATOMIC_MASK) 133 - kfree_skb(res->atomic.skb); 134 132 res->type = 0; 135 133 } 136 134 ··· 140 142 if (qp->resp.resources) { 141 143 for (i = 0; i < qp->attr.max_dest_rd_atomic; i++) { 142 144 res = &qp->resp.resources[i]; 143 - free_rd_atomic_resource(qp, res); 145 + free_rd_atomic_resource(res); 144 146 } 145 147 } 146 148 } ··· 171 173 } 172 174 173 175 spin_lock_init(&qp->state_lock); 176 + 177 + spin_lock_init(&qp->req.task.state_lock); 178 + spin_lock_init(&qp->resp.task.state_lock); 179 + spin_lock_init(&qp->comp.task.state_lock); 180 + 181 + spin_lock_init(&qp->sq.sq_lock); 182 + spin_lock_init(&qp->rq.producer_lock); 183 + spin_lock_init(&qp->rq.consumer_lock); 174 184 175 185 atomic_set(&qp->ssn, 0); 176 186 atomic_set(&qp->skb_out, 0); ··· 236 230 QUEUE_TYPE_FROM_CLIENT); 237 231 238 232 qp->req.state = QP_STATE_RESET; 233 + qp->comp.state = QP_STATE_RESET; 239 234 qp->req.opcode = -1; 240 235 qp->comp.opcode = -1; 241 236 242 - spin_lock_init(&qp->sq.sq_lock); 243 237 skb_queue_head_init(&qp->req_pkts); 244 238 245 239 rxe_init_task(rxe, &qp->req.task, qp, ··· 289 283 return err; 290 284 } 291 285 } 292 - 293 - spin_lock_init(&qp->rq.producer_lock); 294 - spin_lock_init(&qp->rq.consumer_lock); 295 286 296 287 skb_queue_head_init(&qp->resp_pkts); 297 288 ··· 493 490 494 491 /* move qp to the reset state */ 495 492 qp->req.state = QP_STATE_RESET; 493 + qp->comp.state = QP_STATE_RESET; 496 494 qp->resp.state = QP_STATE_RESET; 497 495 498 496 /* let state machines reset themselves drain work and packet queues ··· 511 507 atomic_set(&qp->ssn, 0); 512 508 qp->req.opcode = -1; 513 509 qp->req.need_retry = 0; 510 + qp->req.wait_for_rnr_timer = 0; 514 511 qp->req.noack_pkts = 0; 515 512 qp->resp.msn = 0; 516 513 qp->resp.opcode = -1; ··· 557 552 { 558 553 qp->req.state = QP_STATE_ERROR; 559 554 qp->resp.state = QP_STATE_ERROR; 555 + qp->comp.state = QP_STATE_ERROR; 560 556 qp->attr.qp_state = IB_QPS_ERR; 561 557 562 558 /* drain work and packet queues */ ··· 695 689 pr_debug("qp#%d state -> INIT\n", qp_num(qp)); 696 690 qp->req.state = QP_STATE_INIT; 697 691 qp->resp.state = QP_STATE_INIT; 692 + qp->comp.state = QP_STATE_INIT; 698 693 break; 699 694 700 695 case IB_QPS_RTR: ··· 706 699 case IB_QPS_RTS: 707 700 pr_debug("qp#%d state -> RTS\n", qp_num(qp)); 708 701 qp->req.state = QP_STATE_READY; 702 + qp->comp.state = QP_STATE_READY; 709 703 break; 710 704 711 705 case IB_QPS_SQD: ··· 812 804 if (qp->rq.queue) 813 805 rxe_queue_cleanup(qp->rq.queue); 814 806 815 - atomic_dec(&qp->scq->num_wq); 816 - if (qp->scq) 807 + if (qp->scq) { 808 + atomic_dec(&qp->scq->num_wq); 817 809 rxe_put(qp->scq); 810 + } 818 811 819 - atomic_dec(&qp->rcq->num_wq); 820 - if (qp->rcq) 812 + if (qp->rcq) { 813 + atomic_dec(&qp->rcq->num_wq); 821 814 rxe_put(qp->rcq); 815 + } 822 816 823 817 if (qp->pd) 824 818 rxe_put(qp->pd);
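One small fix in the qp cleanup hunk above is purely about ordering: the old code decremented scq->num_wq/rcq->num_wq through the pointer before checking it for NULL. A minimal illustration of the corrected pattern (standalone C, not driver code):

#include <stdio.h>
#include <stddef.h>

struct cq { int num_wq; };

static void detach_cq(struct cq *scq)
{
        if (scq) {
                scq->num_wq--;          /* only dereference a valid pointer */
                /* drop the reference on the CQ here */
        }
}

int main(void)
{
        struct cq send_cq = { .num_wq = 1 };

        detach_cq(&send_cq);            /* normal case                      */
        detach_cq(NULL);                /* QP without a CQ: now a no-op     */
        printf("num_wq = %d\n", send_cq.num_wq);
        return 0;
}
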
+2 -3
drivers/infiniband/sw/rxe/rxe_queue.h
··· 7 7 #ifndef RXE_QUEUE_H 8 8 #define RXE_QUEUE_H 9 9 10 - /* for definition of shared struct rxe_queue_buf */ 11 - #include <uapi/rdma/rdma_user_rxe.h> 12 - 13 10 /* Implements a simple circular buffer that is shared between user 14 11 * and the driver and can be resized. The requested element size is 15 12 * rounded up to a power of 2 and the number of elements in the buffer ··· 49 52 QUEUE_TYPE_TO_DRIVER, 50 53 QUEUE_TYPE_FROM_DRIVER, 51 54 }; 55 + 56 + struct rxe_queue_buf; 52 57 53 58 struct rxe_queue { 54 59 struct rxe_dev *rxe;
+98 -39
drivers/infiniband/sw/rxe/rxe_req.c
··· 15 15 u32 opcode); 16 16 17 17 static inline void retry_first_write_send(struct rxe_qp *qp, 18 - struct rxe_send_wqe *wqe, 19 - unsigned int mask, int npsn) 18 + struct rxe_send_wqe *wqe, int npsn) 20 19 { 21 20 int i; 22 21 ··· 82 83 if (mask & WR_WRITE_OR_SEND_MASK) { 83 84 npsn = (qp->comp.psn - wqe->first_psn) & 84 85 BTH_PSN_MASK; 85 - retry_first_write_send(qp, wqe, mask, npsn); 86 + retry_first_write_send(qp, wqe, npsn); 86 87 } 87 88 88 89 if (mask & WR_READ_MASK) { ··· 100 101 { 101 102 struct rxe_qp *qp = from_timer(qp, t, rnr_nak_timer); 102 103 103 - pr_debug("qp#%d rnr nak timer fired\n", qp_num(qp)); 104 + pr_debug("%s: fired for qp#%d\n", __func__, qp_num(qp)); 105 + 106 + /* request a send queue retry */ 107 + qp->req.need_retry = 1; 108 + qp->req.wait_for_rnr_timer = 0; 104 109 rxe_run_task(&qp->req.task, 1); 105 110 } 106 111 ··· 164 161 (wqe->state != wqe_state_processing))) 165 162 return NULL; 166 163 167 - if (unlikely((wqe->wr.send_flags & IB_SEND_FENCE) && 168 - (index != cons))) { 169 - qp->req.wait_fence = 1; 170 - return NULL; 171 - } 172 - 173 164 wqe->mask = wr_opcode_mask(wqe->wr.opcode, qp); 174 165 return wqe; 166 + } 167 + 168 + /** 169 + * rxe_wqe_is_fenced - check if next wqe is fenced 170 + * @qp: the queue pair 171 + * @wqe: the next wqe 172 + * 173 + * Returns: 1 if wqe needs to wait 174 + * 0 if wqe is ready to go 175 + */ 176 + static int rxe_wqe_is_fenced(struct rxe_qp *qp, struct rxe_send_wqe *wqe) 177 + { 178 + /* Local invalidate fence (LIF) see IBA 10.6.5.1 179 + * Requires ALL previous operations on the send queue 180 + * are complete. Make mandatory for the rxe driver. 181 + */ 182 + if (wqe->wr.opcode == IB_WR_LOCAL_INV) 183 + return qp->req.wqe_index != queue_get_consumer(qp->sq.queue, 184 + QUEUE_TYPE_FROM_CLIENT); 185 + 186 + /* Fence see IBA 10.8.3.3 187 + * Requires that all previous read and atomic operations 188 + * are complete. 189 + */ 190 + return (wqe->wr.send_flags & IB_SEND_FENCE) && 191 + atomic_read(&qp->req.rd_atomic) != qp->attr.max_rd_atomic; 175 192 } 176 193 177 194 static int next_opcode_rc(struct rxe_qp *qp, u32 opcode, int fits) ··· 604 581 wqe->status = IB_WC_SUCCESS; 605 582 qp->req.wqe_index = queue_next_index(qp->sq.queue, qp->req.wqe_index); 606 583 607 - if ((wqe->wr.send_flags & IB_SEND_SIGNALED) || 608 - qp->sq_sig_type == IB_SIGNAL_ALL_WR) 609 - rxe_run_task(&qp->comp.task, 1); 584 + /* There is no ack coming for local work requests 585 + * which can lead to a deadlock. So go ahead and complete 586 + * it now. 
587 + */ 588 + rxe_run_task(&qp->comp.task, 1); 610 589 611 590 return 0; 612 591 } ··· 624 599 u32 payload; 625 600 int mtu; 626 601 int opcode; 602 + int err; 627 603 int ret; 628 604 struct rxe_send_wqe rollback_wqe; 629 605 u32 rollback_psn; ··· 635 609 if (!rxe_get(qp)) 636 610 return -EAGAIN; 637 611 638 - next_wqe: 639 - if (unlikely(!qp->valid || qp->req.state == QP_STATE_ERROR)) 612 + if (unlikely(!qp->valid)) 640 613 goto exit; 614 + 615 + if (unlikely(qp->req.state == QP_STATE_ERROR)) { 616 + wqe = req_next_wqe(qp); 617 + if (wqe) 618 + /* 619 + * Generate an error completion for error qp state 620 + */ 621 + goto err; 622 + else 623 + goto exit; 624 + } 641 625 642 626 if (unlikely(qp->req.state == QP_STATE_RESET)) { 643 627 qp->req.wqe_index = queue_get_consumer(q, ··· 656 620 qp->req.need_rd_atomic = 0; 657 621 qp->req.wait_psn = 0; 658 622 qp->req.need_retry = 0; 623 + qp->req.wait_for_rnr_timer = 0; 659 624 goto exit; 660 625 } 661 626 662 - if (unlikely(qp->req.need_retry)) { 627 + /* we come here if the retransmit timer has fired 628 + * or if the rnr timer has fired. If the retransmit 629 + * timer fires while we are processing an RNR NAK wait 630 + * until the rnr timer has fired before starting the 631 + * retry flow 632 + */ 633 + if (unlikely(qp->req.need_retry && !qp->req.wait_for_rnr_timer)) { 663 634 req_retry(qp); 664 635 qp->req.need_retry = 0; 665 636 } ··· 675 632 if (unlikely(!wqe)) 676 633 goto exit; 677 634 635 + if (rxe_wqe_is_fenced(qp, wqe)) { 636 + qp->req.wait_fence = 1; 637 + goto exit; 638 + } 639 + 678 640 if (wqe->mask & WR_LOCAL_OP_MASK) { 679 - ret = rxe_do_local_ops(qp, wqe); 680 - if (unlikely(ret)) 641 + err = rxe_do_local_ops(qp, wqe); 642 + if (unlikely(err)) 681 643 goto err; 682 644 else 683 - goto next_wqe; 645 + goto done; 684 646 } 685 647 686 648 if (unlikely(qp_type(qp) == IB_QPT_RC && ··· 733 685 qp->req.wqe_index); 734 686 wqe->state = wqe_state_done; 735 687 wqe->status = IB_WC_SUCCESS; 736 - __rxe_do_task(&qp->comp.task); 737 - rxe_put(qp); 738 - return 0; 688 + rxe_run_task(&qp->comp.task, 0); 689 + goto done; 739 690 } 740 691 payload = mtu; 741 692 } ··· 750 703 if (unlikely(!av)) { 751 704 pr_err("qp#%d Failed no address vector\n", qp_num(qp)); 752 705 wqe->status = IB_WC_LOC_QP_OP_ERR; 753 - goto err_drop_ah; 706 + goto err; 754 707 } 755 708 756 709 skb = init_req_packet(qp, av, wqe, opcode, payload, &pkt); 757 710 if (unlikely(!skb)) { 758 711 pr_err("qp#%d Failed allocating skb\n", qp_num(qp)); 759 712 wqe->status = IB_WC_LOC_QP_OP_ERR; 760 - goto err_drop_ah; 713 + if (ah) 714 + rxe_put(ah); 715 + goto err; 761 716 } 762 717 763 - ret = finish_packet(qp, av, wqe, &pkt, skb, payload); 764 - if (unlikely(ret)) { 718 + err = finish_packet(qp, av, wqe, &pkt, skb, payload); 719 + if (unlikely(err)) { 765 720 pr_debug("qp#%d Error during finish packet\n", qp_num(qp)); 766 - if (ret == -EFAULT) 721 + if (err == -EFAULT) 767 722 wqe->status = IB_WC_LOC_PROT_ERR; 768 723 else 769 724 wqe->status = IB_WC_LOC_QP_OP_ERR; 770 725 kfree_skb(skb); 771 - goto err_drop_ah; 726 + if (ah) 727 + rxe_put(ah); 728 + goto err; 772 729 } 773 730 774 731 if (ah) ··· 787 736 save_state(wqe, qp, &rollback_wqe, &rollback_psn); 788 737 update_wqe_state(qp, wqe, &pkt); 789 738 update_wqe_psn(qp, wqe, &pkt, payload); 790 - ret = rxe_xmit_packet(qp, &pkt, skb); 791 - if (ret) { 739 + 740 + err = rxe_xmit_packet(qp, &pkt, skb); 741 + if (err) { 792 742 qp->need_req_skb = 1; 793 743 794 744 rollback_state(wqe, qp, &rollback_wqe, rollback_psn); 795 745 796 
- if (ret == -EAGAIN) { 746 + if (err == -EAGAIN) { 797 747 rxe_run_task(&qp->req.task, 1); 798 748 goto exit; 799 749 } ··· 805 753 806 754 update_state(qp, &pkt); 807 755 808 - goto next_wqe; 809 - 810 - err_drop_ah: 811 - if (ah) 812 - rxe_put(ah); 756 + /* A non-zero return value will cause rxe_do_task to 757 + * exit its loop and end the tasklet. A zero return 758 + * will continue looping and return to rxe_requester 759 + */ 760 + done: 761 + ret = 0; 762 + goto out; 813 763 err: 764 + /* update wqe_index for each wqe completion */ 765 + qp->req.wqe_index = queue_next_index(qp->sq.queue, qp->req.wqe_index); 814 766 wqe->state = wqe_state_error; 815 - __rxe_do_task(&qp->comp.task); 816 - 767 + qp->req.state = QP_STATE_ERROR; 768 + rxe_run_task(&qp->comp.task, 0); 817 769 exit: 770 + ret = -EAGAIN; 771 + out: 818 772 rxe_put(qp); 819 - return -EAGAIN; 773 + 774 + return ret; 820 775 }
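The new rxe_wqe_is_fenced() above separates two rules: a Local Invalidate must wait until the whole send queue has drained, while an ordinary IB_SEND_FENCE WQE only waits for outstanding RDMA Read/Atomic operations. A compact sketch of that decision (standalone C with simplified field names, not the driver code):

#include <stdio.h>

#define IB_SEND_FENCE   (1 << 0)

struct wqe {
        int is_local_inv;       /* IB_WR_LOCAL_INV?                 */
        int send_flags;
};

struct req_state {
        int wqe_index;          /* next wqe the requester works on  */
        int sq_consumer;        /* oldest not-yet-completed wqe     */
        int rd_atomic;          /* free read/atomic credits         */
        int max_rd_atomic;      /* equals rd_atomic when idle       */
};

static int wqe_is_fenced(const struct req_state *q, const struct wqe *w)
{
        /* Local Invalidate: every earlier wqe must already be complete */
        if (w->is_local_inv)
                return q->wqe_index != q->sq_consumer;

        /* IB_SEND_FENCE: every earlier read/atomic must be complete */
        return (w->send_flags & IB_SEND_FENCE) &&
               q->rd_atomic != q->max_rd_atomic;
}

int main(void)
{
        struct req_state q = { .wqe_index = 5, .sq_consumer = 3,
                               .rd_atomic = 2, .max_rd_atomic = 4 };
        struct wqe inv = { .is_local_inv = 1 };
        struct wqe fenced = { .send_flags = IB_SEND_FENCE };

        printf("local invalidate waits: %d\n", wqe_is_fenced(&q, &inv));
        printf("fenced wqe waits:       %d\n", wqe_is_fenced(&q, &fenced));

        q.rd_atomic = q.max_rd_atomic;  /* all reads/atomics completed */
        printf("fenced wqe waits:       %d\n", wqe_is_fenced(&q, &fenced));
        return 0;
}
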
+133 -109
drivers/infiniband/sw/rxe/rxe_resp.c
··· 21 21 RESPST_CHK_RKEY, 22 22 RESPST_EXECUTE, 23 23 RESPST_READ_REPLY, 24 + RESPST_ATOMIC_REPLY, 24 25 RESPST_COMPLETE, 25 26 RESPST_ACKNOWLEDGE, 26 27 RESPST_CLEANUP, ··· 56 55 [RESPST_CHK_RKEY] = "CHK_RKEY", 57 56 [RESPST_EXECUTE] = "EXECUTE", 58 57 [RESPST_READ_REPLY] = "READ_REPLY", 58 + [RESPST_ATOMIC_REPLY] = "ATOMIC_REPLY", 59 59 [RESPST_COMPLETE] = "COMPLETE", 60 60 [RESPST_ACKNOWLEDGE] = "ACKNOWLEDGE", 61 61 [RESPST_CLEANUP] = "CLEANUP", ··· 450 448 if (rkey_is_mw(rkey)) { 451 449 mw = rxe_lookup_mw(qp, access, rkey); 452 450 if (!mw) { 453 - pr_err("%s: no MW matches rkey %#x\n", __func__, rkey); 451 + pr_debug("%s: no MW matches rkey %#x\n", 452 + __func__, rkey); 454 453 state = RESPST_ERR_RKEY_VIOLATION; 455 454 goto err; 456 455 } ··· 471 468 } else { 472 469 mr = lookup_mr(qp->pd, access, rkey, RXE_LOOKUP_REMOTE); 473 470 if (!mr) { 474 - pr_err("%s: no MR matches rkey %#x\n", __func__, rkey); 471 + pr_debug("%s: no MR matches rkey %#x\n", 472 + __func__, rkey); 475 473 state = RESPST_ERR_RKEY_VIOLATION; 476 474 goto err; 477 475 } ··· 553 549 return rc; 554 550 } 555 551 552 + static struct resp_res *rxe_prepare_res(struct rxe_qp *qp, 553 + struct rxe_pkt_info *pkt, 554 + int type) 555 + { 556 + struct resp_res *res; 557 + u32 pkts; 558 + 559 + res = &qp->resp.resources[qp->resp.res_head]; 560 + rxe_advance_resp_resource(qp); 561 + free_rd_atomic_resource(res); 562 + 563 + res->type = type; 564 + res->replay = 0; 565 + 566 + switch (type) { 567 + case RXE_READ_MASK: 568 + res->read.va = qp->resp.va + qp->resp.offset; 569 + res->read.va_org = qp->resp.va + qp->resp.offset; 570 + res->read.resid = qp->resp.resid; 571 + res->read.length = qp->resp.resid; 572 + res->read.rkey = qp->resp.rkey; 573 + 574 + pkts = max_t(u32, (reth_len(pkt) + qp->mtu - 1)/qp->mtu, 1); 575 + res->first_psn = pkt->psn; 576 + res->cur_psn = pkt->psn; 577 + res->last_psn = (pkt->psn + pkts - 1) & BTH_PSN_MASK; 578 + 579 + res->state = rdatm_res_state_new; 580 + break; 581 + case RXE_ATOMIC_MASK: 582 + res->first_psn = pkt->psn; 583 + res->last_psn = pkt->psn; 584 + res->cur_psn = pkt->psn; 585 + break; 586 + } 587 + 588 + return res; 589 + } 590 + 556 591 /* Guarantee atomicity of atomic operations at the machine level. */ 557 592 static DEFINE_SPINLOCK(atomic_ops_lock); 558 593 559 - static enum resp_states process_atomic(struct rxe_qp *qp, 560 - struct rxe_pkt_info *pkt) 594 + static enum resp_states atomic_reply(struct rxe_qp *qp, 595 + struct rxe_pkt_info *pkt) 561 596 { 562 597 u64 *vaddr; 563 598 enum resp_states ret; 564 599 struct rxe_mr *mr = qp->resp.mr; 600 + struct resp_res *res = qp->resp.res; 601 + u64 value; 565 602 566 - if (mr->state != RXE_MR_STATE_VALID) { 567 - ret = RESPST_ERR_RKEY_VIOLATION; 568 - goto out; 603 + if (!res) { 604 + res = rxe_prepare_res(qp, pkt, RXE_ATOMIC_MASK); 605 + qp->resp.res = res; 569 606 } 570 607 571 - vaddr = iova_to_vaddr(mr, qp->resp.va + qp->resp.offset, sizeof(u64)); 608 + if (!res->replay) { 609 + if (mr->state != RXE_MR_STATE_VALID) { 610 + ret = RESPST_ERR_RKEY_VIOLATION; 611 + goto out; 612 + } 572 613 573 - /* check vaddr is 8 bytes aligned. */ 574 - if (!vaddr || (uintptr_t)vaddr & 7) { 575 - ret = RESPST_ERR_MISALIGNED_ATOMIC; 576 - goto out; 614 + vaddr = iova_to_vaddr(mr, qp->resp.va + qp->resp.offset, 615 + sizeof(u64)); 616 + 617 + /* check vaddr is 8 bytes aligned. 
*/ 618 + if (!vaddr || (uintptr_t)vaddr & 7) { 619 + ret = RESPST_ERR_MISALIGNED_ATOMIC; 620 + goto out; 621 + } 622 + 623 + spin_lock_bh(&atomic_ops_lock); 624 + res->atomic.orig_val = value = *vaddr; 625 + 626 + if (pkt->opcode == IB_OPCODE_RC_COMPARE_SWAP) { 627 + if (value == atmeth_comp(pkt)) 628 + value = atmeth_swap_add(pkt); 629 + } else { 630 + value += atmeth_swap_add(pkt); 631 + } 632 + 633 + *vaddr = value; 634 + spin_unlock_bh(&atomic_ops_lock); 635 + 636 + qp->resp.msn++; 637 + 638 + /* next expected psn, read handles this separately */ 639 + qp->resp.psn = (pkt->psn + 1) & BTH_PSN_MASK; 640 + qp->resp.ack_psn = qp->resp.psn; 641 + 642 + qp->resp.opcode = pkt->opcode; 643 + qp->resp.status = IB_WC_SUCCESS; 577 644 } 578 645 579 - spin_lock_bh(&atomic_ops_lock); 580 - 581 - qp->resp.atomic_orig = *vaddr; 582 - 583 - if (pkt->opcode == IB_OPCODE_RC_COMPARE_SWAP) { 584 - if (*vaddr == atmeth_comp(pkt)) 585 - *vaddr = atmeth_swap_add(pkt); 586 - } else { 587 - *vaddr += atmeth_swap_add(pkt); 588 - } 589 - 590 - spin_unlock_bh(&atomic_ops_lock); 591 - 592 - ret = RESPST_NONE; 646 + ret = RESPST_ACKNOWLEDGE; 593 647 out: 594 648 return ret; 595 649 } 596 650 597 651 static struct sk_buff *prepare_ack_packet(struct rxe_qp *qp, 598 - struct rxe_pkt_info *pkt, 599 652 struct rxe_pkt_info *ack, 600 653 int opcode, 601 654 int payload, ··· 690 629 } 691 630 692 631 if (ack->mask & RXE_ATMACK_MASK) 693 - atmack_set_orig(ack, qp->resp.atomic_orig); 632 + atmack_set_orig(ack, qp->resp.res->atomic.orig_val); 694 633 695 634 err = rxe_prepare(&qp->pri_av, ack, skb); 696 635 if (err) { ··· 699 638 } 700 639 701 640 return skb; 702 - } 703 - 704 - static struct resp_res *rxe_prepare_read_res(struct rxe_qp *qp, 705 - struct rxe_pkt_info *pkt) 706 - { 707 - struct resp_res *res; 708 - u32 pkts; 709 - 710 - res = &qp->resp.resources[qp->resp.res_head]; 711 - rxe_advance_resp_resource(qp); 712 - free_rd_atomic_resource(qp, res); 713 - 714 - res->type = RXE_READ_MASK; 715 - res->replay = 0; 716 - res->read.va = qp->resp.va + qp->resp.offset; 717 - res->read.va_org = qp->resp.va + qp->resp.offset; 718 - res->read.resid = qp->resp.resid; 719 - res->read.length = qp->resp.resid; 720 - res->read.rkey = qp->resp.rkey; 721 - 722 - pkts = max_t(u32, (reth_len(pkt) + qp->mtu - 1)/qp->mtu, 1); 723 - res->first_psn = pkt->psn; 724 - res->cur_psn = pkt->psn; 725 - res->last_psn = (pkt->psn + pkts - 1) & BTH_PSN_MASK; 726 - 727 - res->state = rdatm_res_state_new; 728 - 729 - return res; 730 641 } 731 642 732 643 /** ··· 771 738 struct rxe_mr *mr; 772 739 773 740 if (!res) { 774 - res = rxe_prepare_read_res(qp, req_pkt); 741 + res = rxe_prepare_res(qp, req_pkt, RXE_READ_MASK); 775 742 qp->resp.res = res; 776 743 } 777 744 ··· 804 771 805 772 payload = min_t(int, res->read.resid, mtu); 806 773 807 - skb = prepare_ack_packet(qp, req_pkt, &ack_pkt, opcode, payload, 774 + skb = prepare_ack_packet(qp, &ack_pkt, opcode, payload, 808 775 res->cur_psn, AETH_ACK_UNLIMITED); 809 776 if (!skb) 810 777 return RESPST_ERR_RNR; ··· 891 858 qp->resp.msn++; 892 859 return RESPST_READ_REPLY; 893 860 } else if (pkt->mask & RXE_ATOMIC_MASK) { 894 - err = process_atomic(qp, pkt); 895 - if (err) 896 - return err; 861 + return RESPST_ATOMIC_REPLY; 897 862 } else { 898 863 /* Unreachable */ 899 864 WARN_ON_ONCE(1); ··· 1028 997 return RESPST_CLEANUP; 1029 998 } 1030 999 1031 - static int send_ack(struct rxe_qp *qp, struct rxe_pkt_info *pkt, 1032 - u8 syndrome, u32 psn) 1000 + static int send_ack(struct rxe_qp *qp, u8 syndrome, u32 
psn) 1033 1001 { 1034 1002 int err = 0; 1035 1003 struct rxe_pkt_info ack_pkt; 1036 1004 struct sk_buff *skb; 1037 1005 1038 - skb = prepare_ack_packet(qp, pkt, &ack_pkt, IB_OPCODE_RC_ACKNOWLEDGE, 1006 + skb = prepare_ack_packet(qp, &ack_pkt, IB_OPCODE_RC_ACKNOWLEDGE, 1039 1007 0, psn, syndrome); 1040 1008 if (!skb) { 1041 1009 err = -ENOMEM; ··· 1049 1019 return err; 1050 1020 } 1051 1021 1052 - static int send_atomic_ack(struct rxe_qp *qp, struct rxe_pkt_info *pkt, 1053 - u8 syndrome) 1022 + static int send_atomic_ack(struct rxe_qp *qp, u8 syndrome, u32 psn) 1054 1023 { 1055 - int rc = 0; 1024 + int err = 0; 1056 1025 struct rxe_pkt_info ack_pkt; 1057 1026 struct sk_buff *skb; 1058 - struct resp_res *res; 1059 1027 1060 - skb = prepare_ack_packet(qp, pkt, &ack_pkt, 1061 - IB_OPCODE_RC_ATOMIC_ACKNOWLEDGE, 0, pkt->psn, 1062 - syndrome); 1028 + skb = prepare_ack_packet(qp, &ack_pkt, IB_OPCODE_RC_ATOMIC_ACKNOWLEDGE, 1029 + 0, psn, syndrome); 1063 1030 if (!skb) { 1064 - rc = -ENOMEM; 1031 + err = -ENOMEM; 1065 1032 goto out; 1066 1033 } 1067 1034 1068 - res = &qp->resp.resources[qp->resp.res_head]; 1069 - free_rd_atomic_resource(qp, res); 1070 - rxe_advance_resp_resource(qp); 1035 + err = rxe_xmit_packet(qp, &ack_pkt, skb); 1036 + if (err) 1037 + pr_err_ratelimited("Failed sending atomic ack\n"); 1071 1038 1072 - skb_get(skb); 1073 - res->type = RXE_ATOMIC_MASK; 1074 - res->atomic.skb = skb; 1075 - res->first_psn = ack_pkt.psn; 1076 - res->last_psn = ack_pkt.psn; 1077 - res->cur_psn = ack_pkt.psn; 1078 - 1079 - rc = rxe_xmit_packet(qp, &ack_pkt, skb); 1080 - if (rc) { 1081 - pr_err_ratelimited("Failed sending ack\n"); 1082 - rxe_put(qp); 1083 - } 1039 + /* have to clear this since it is used to trigger 1040 + * long read replies 1041 + */ 1042 + qp->resp.res = NULL; 1084 1043 out: 1085 - return rc; 1044 + return err; 1086 1045 } 1087 1046 1088 1047 static enum resp_states acknowledge(struct rxe_qp *qp, ··· 1081 1062 return RESPST_CLEANUP; 1082 1063 1083 1064 if (qp->resp.aeth_syndrome != AETH_ACK_UNLIMITED) 1084 - send_ack(qp, pkt, qp->resp.aeth_syndrome, pkt->psn); 1065 + send_ack(qp, qp->resp.aeth_syndrome, pkt->psn); 1085 1066 else if (pkt->mask & RXE_ATOMIC_MASK) 1086 - send_atomic_ack(qp, pkt, AETH_ACK_UNLIMITED); 1067 + send_atomic_ack(qp, AETH_ACK_UNLIMITED, pkt->psn); 1087 1068 else if (bth_ack(pkt)) 1088 - send_ack(qp, pkt, AETH_ACK_UNLIMITED, pkt->psn); 1069 + send_ack(qp, AETH_ACK_UNLIMITED, pkt->psn); 1089 1070 1090 1071 return RESPST_CLEANUP; 1091 1072 } ··· 1138 1119 if (pkt->mask & RXE_SEND_MASK || 1139 1120 pkt->mask & RXE_WRITE_MASK) { 1140 1121 /* SEND. Ack again and cleanup. C9-105. */ 1141 - send_ack(qp, pkt, AETH_ACK_UNLIMITED, prev_psn); 1122 + send_ack(qp, AETH_ACK_UNLIMITED, prev_psn); 1142 1123 return RESPST_CLEANUP; 1143 1124 } else if (pkt->mask & RXE_READ_MASK) { 1144 1125 struct resp_res *res; ··· 1192 1173 /* Find the operation in our list of responder resources. */ 1193 1174 res = find_resource(qp, pkt->psn); 1194 1175 if (res) { 1195 - skb_get(res->atomic.skb); 1196 - /* Resend the result. */ 1197 - rc = rxe_xmit_packet(qp, pkt, res->atomic.skb); 1198 - if (rc) { 1199 - pr_err("Failed resending result. This flow is not handled - skb ignored\n"); 1200 - rc = RESPST_CLEANUP; 1201 - goto out; 1202 - } 1176 + res->replay = 1; 1177 + res->cur_psn = pkt->psn; 1178 + qp->resp.res = res; 1179 + rc = RESPST_ATOMIC_REPLY; 1180 + goto out; 1203 1181 } 1204 1182 1205 1183 /* Resource not found. Class D error. Drop the request. 
*/ ··· 1276 1260 struct rxe_dev *rxe = to_rdev(qp->ibqp.device); 1277 1261 enum resp_states state; 1278 1262 struct rxe_pkt_info *pkt = NULL; 1279 - int ret = 0; 1263 + int ret; 1280 1264 1281 1265 if (!rxe_get(qp)) 1282 1266 return -EAGAIN; 1283 1267 1284 1268 qp->resp.aeth_syndrome = AETH_ACK_UNLIMITED; 1285 1269 1286 - if (!qp->valid) { 1287 - ret = -EINVAL; 1288 - goto done; 1289 - } 1270 + if (!qp->valid) 1271 + goto exit; 1290 1272 1291 1273 switch (qp->resp.state) { 1292 1274 case QP_STATE_RESET: ··· 1330 1316 case RESPST_READ_REPLY: 1331 1317 state = read_reply(qp, pkt); 1332 1318 break; 1319 + case RESPST_ATOMIC_REPLY: 1320 + state = atomic_reply(qp, pkt); 1321 + break; 1333 1322 case RESPST_ACKNOWLEDGE: 1334 1323 state = acknowledge(qp, pkt); 1335 1324 break; ··· 1344 1327 break; 1345 1328 case RESPST_ERR_PSN_OUT_OF_SEQ: 1346 1329 /* RC only - Class B. Drop packet. */ 1347 - send_ack(qp, pkt, AETH_NAK_PSN_SEQ_ERROR, qp->resp.psn); 1330 + send_ack(qp, AETH_NAK_PSN_SEQ_ERROR, qp->resp.psn); 1348 1331 state = RESPST_CLEANUP; 1349 1332 break; 1350 1333 ··· 1366 1349 if (qp_type(qp) == IB_QPT_RC) { 1367 1350 rxe_counter_inc(rxe, RXE_CNT_SND_RNR); 1368 1351 /* RC - class B */ 1369 - send_ack(qp, pkt, AETH_RNR_NAK | 1352 + send_ack(qp, AETH_RNR_NAK | 1370 1353 (~AETH_TYPE_MASK & 1371 1354 qp->attr.min_rnr_timer), 1372 1355 pkt->psn); ··· 1455 1438 1456 1439 case RESPST_ERROR: 1457 1440 qp->resp.goto_error = 0; 1458 - pr_warn("qp#%d moved to error state\n", qp_num(qp)); 1441 + pr_debug("qp#%d moved to error state\n", qp_num(qp)); 1459 1442 rxe_qp_error(qp); 1460 1443 goto exit; 1461 1444 ··· 1464 1447 } 1465 1448 } 1466 1449 1450 + /* A non-zero return value will cause rxe_do_task to 1451 + * exit its loop and end the tasklet. A zero return 1452 + * will continue looping and return to rxe_responder 1453 + */ 1454 + done: 1455 + ret = 0; 1456 + goto out; 1467 1457 exit: 1468 1458 ret = -EAGAIN; 1469 - done: 1459 + out: 1470 1460 rxe_put(qp); 1471 1461 return ret; 1472 1462 }
+12 -4
drivers/infiniband/sw/rxe/rxe_task.c
··· 8 8 #include <linux/interrupt.h> 9 9 #include <linux/hardirq.h> 10 10 11 - #include "rxe_task.h" 11 + #include "rxe.h" 12 12 13 13 int __rxe_do_task(struct rxe_task *task) 14 14 ··· 33 33 int cont; 34 34 int ret; 35 35 struct rxe_task *task = from_tasklet(task, t, tasklet); 36 + unsigned int iterations = RXE_MAX_ITERATIONS; 36 37 37 38 spin_lock_bh(&task->state_lock); 38 39 switch (task->state) { ··· 62 61 spin_lock_bh(&task->state_lock); 63 62 switch (task->state) { 64 63 case TASK_STATE_BUSY: 65 - if (ret) 64 + if (ret) { 66 65 task->state = TASK_STATE_START; 67 - else 66 + } else if (iterations--) { 68 67 cont = 1; 68 + } else { 69 + /* reschedule the tasklet and exit 70 + * the loop to give up the cpu 71 + */ 72 + tasklet_schedule(&task->tasklet); 73 + task->state = TASK_STATE_START; 74 + } 69 75 break; 70 76 71 - /* soneone tried to run the task since the last time we called 77 + /* someone tried to run the task since the last time we called 72 78 * func, so we will call one more time regardless of the 73 79 * return value 74 80 */
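The rxe_task.c change above bounds how long one tasklet invocation can spin: after RXE_MAX_ITERATIONS work items it reschedules itself instead of monopolizing the CPU. A standalone sketch of that bounded-loop shape (plain C, not the tasklet code):

#include <stdio.h>

#define MAX_ITERATIONS 1024     /* mirrors RXE_MAX_ITERATIONS */

/* consume one unit of work; returns 1 when the queue is empty */
static int do_one_unit(int *remaining)
{
        if (*remaining == 0)
                return 1;
        (*remaining)--;
        return 0;
}

/* returns 1 if the "tasklet" must be rescheduled to finish the work */
static int run_task(int *remaining)
{
        int iterations = MAX_ITERATIONS;

        while (iterations--) {
                if (do_one_unit(remaining))
                        return 0;       /* drained everything */
        }
        return 1;                       /* yield the cpu, run again later */
}

int main(void)
{
        int work = 3000, reschedules = 0;

        while (run_task(&work))
                reschedules++;          /* stands in for tasklet_schedule() */

        printf("drained after %d reschedules\n", reschedules);
        return 0;
}
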
+52 -26
drivers/infiniband/sw/rxe/rxe_verbs.c
··· 115 115 { 116 116 struct rxe_ucontext *uc = to_ruc(ibuc); 117 117 118 - rxe_put(uc); 118 + rxe_cleanup(uc); 119 119 } 120 120 121 121 static int rxe_port_immutable(struct ib_device *dev, u32 port_num, ··· 149 149 { 150 150 struct rxe_pd *pd = to_rpd(ibpd); 151 151 152 - rxe_put(pd); 152 + rxe_cleanup(pd); 153 153 return 0; 154 154 } 155 155 ··· 176 176 if (err) 177 177 return err; 178 178 179 - err = rxe_add_to_pool(&rxe->ah_pool, ah); 179 + err = rxe_add_to_pool_ah(&rxe->ah_pool, ah, 180 + init_attr->flags & RDMA_CREATE_AH_SLEEPABLE); 180 181 if (err) 181 182 return err; 182 183 ··· 189 188 err = copy_to_user(&uresp->ah_num, &ah->ah_num, 190 189 sizeof(uresp->ah_num)); 191 190 if (err) { 192 - rxe_put(ah); 191 + rxe_cleanup(ah); 193 192 return -EFAULT; 194 193 } 195 194 } else if (ah->is_user) { ··· 198 197 } 199 198 200 199 rxe_init_av(init_attr->ah_attr, &ah->av); 200 + rxe_finalize(ah); 201 + 201 202 return 0; 202 203 } 203 204 ··· 231 228 { 232 229 struct rxe_ah *ah = to_rah(ibah); 233 230 234 - rxe_put(ah); 231 + rxe_cleanup_ah(ah, flags & RDMA_DESTROY_AH_SLEEPABLE); 232 + 235 233 return 0; 236 234 } 237 235 ··· 312 308 313 309 err = rxe_srq_from_init(rxe, srq, init, udata, uresp); 314 310 if (err) 315 - goto err_put; 311 + goto err_cleanup; 316 312 317 313 return 0; 318 314 319 - err_put: 320 - rxe_put(srq); 315 + err_cleanup: 316 + rxe_cleanup(srq); 317 + 321 318 return err; 322 319 } 323 320 ··· 367 362 { 368 363 struct rxe_srq *srq = to_rsrq(ibsrq); 369 364 370 - rxe_put(srq); 365 + rxe_cleanup(srq); 371 366 return 0; 372 367 } 373 368 ··· 434 429 if (err) 435 430 goto qp_init; 436 431 432 + rxe_finalize(qp); 437 433 return 0; 438 434 439 435 qp_init: 440 - rxe_put(qp); 436 + rxe_cleanup(qp); 441 437 return err; 442 438 } 443 439 ··· 491 485 if (ret) 492 486 return ret; 493 487 494 - rxe_put(qp); 488 + rxe_cleanup(qp); 495 489 return 0; 496 490 } 497 491 ··· 809 803 810 804 rxe_cq_disable(cq); 811 805 812 - rxe_put(cq); 806 + rxe_cleanup(cq); 813 807 return 0; 814 808 } 815 809 ··· 904 898 905 899 rxe_get(pd); 906 900 rxe_mr_init_dma(pd, access, mr); 901 + rxe_finalize(mr); 907 902 908 903 return &mr->ibmr; 909 904 } ··· 933 926 if (err) 934 927 goto err3; 935 928 929 + rxe_finalize(mr); 930 + 936 931 return &mr->ibmr; 937 932 938 933 err3: 939 934 rxe_put(pd); 940 - rxe_put(mr); 935 + rxe_cleanup(mr); 941 936 err2: 942 937 return ERR_PTR(err); 943 938 } ··· 967 958 if (err) 968 959 goto err2; 969 960 961 + rxe_finalize(mr); 962 + 970 963 return &mr->ibmr; 971 964 972 965 err2: 973 966 rxe_put(pd); 974 - rxe_put(mr); 967 + rxe_cleanup(mr); 975 968 err1: 976 969 return ERR_PTR(err); 977 970 } 978 971 979 - /* build next_map_set from scatterlist 980 - * The IB_WR_REG_MR WR will swap map_sets 981 - */ 972 + static int rxe_set_page(struct ib_mr *ibmr, u64 addr) 973 + { 974 + struct rxe_mr *mr = to_rmr(ibmr); 975 + struct rxe_map *map; 976 + struct rxe_phys_buf *buf; 977 + 978 + if (unlikely(mr->nbuf == mr->num_buf)) 979 + return -ENOMEM; 980 + 981 + map = mr->map[mr->nbuf / RXE_BUF_PER_MAP]; 982 + buf = &map->buf[mr->nbuf % RXE_BUF_PER_MAP]; 983 + 984 + buf->addr = addr; 985 + buf->size = ibmr->page_size; 986 + mr->nbuf++; 987 + 988 + return 0; 989 + } 990 + 982 991 static int rxe_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, 983 992 int sg_nents, unsigned int *sg_offset) 984 993 { 985 994 struct rxe_mr *mr = to_rmr(ibmr); 986 - struct rxe_map_set *set = mr->next_map_set; 987 995 int n; 988 996 989 - set->nbuf = 0; 997 + mr->nbuf = 0; 990 998 991 - n = 
ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset, rxe_mr_set_page); 999 + n = ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset, rxe_set_page); 992 1000 993 - set->va = ibmr->iova; 994 - set->iova = ibmr->iova; 995 - set->length = ibmr->length; 996 - set->page_shift = ilog2(ibmr->page_size); 997 - set->page_mask = ibmr->page_size - 1; 998 - set->offset = set->iova & set->page_mask; 1001 + mr->va = ibmr->iova; 1002 + mr->iova = ibmr->iova; 1003 + mr->length = ibmr->length; 1004 + mr->page_shift = ilog2(ibmr->page_size); 1005 + mr->page_mask = ibmr->page_size - 1; 1006 + mr->offset = mr->iova & mr->page_mask; 999 1007 1000 1008 return n; 1001 1009 }
+11 -16
drivers/infiniband/sw/rxe/rxe_verbs.h
··· 9 9 10 10 #include <linux/interrupt.h> 11 11 #include <linux/workqueue.h> 12 - #include <rdma/rdma_user_rxe.h> 13 12 #include "rxe_pool.h" 14 13 #include "rxe_task.h" 15 14 #include "rxe_hw_counters.h" ··· 123 124 int need_rd_atomic; 124 125 int wait_psn; 125 126 int need_retry; 127 + int wait_for_rnr_timer; 126 128 int noack_pkts; 127 129 struct rxe_task task; 128 130 }; 129 131 130 132 struct rxe_comp_info { 133 + enum rxe_qp_state state; 131 134 u32 psn; 132 135 int opcode; 133 136 int timeout; ··· 156 155 157 156 union { 158 157 struct { 159 - struct sk_buff *skb; 158 + u64 orig_val; 160 159 } atomic; 161 160 struct { 162 161 u64 va_org; ··· 190 189 u32 resid; 191 190 u32 rkey; 192 191 u32 length; 193 - u64 atomic_orig; 194 192 195 193 /* SRQ only */ 196 194 struct { ··· 288 288 struct rxe_phys_buf buf[RXE_BUF_PER_MAP]; 289 289 }; 290 290 291 - struct rxe_map_set { 292 - struct rxe_map **map; 293 - u64 va; 294 - u64 iova; 295 - size_t length; 296 - u32 offset; 297 - u32 nbuf; 298 - int page_shift; 299 - int page_mask; 300 - }; 301 - 302 291 static inline int rkey_is_mw(u32 rkey) 303 292 { 304 293 u32 index = rkey >> 8; ··· 305 316 u32 rkey; 306 317 enum rxe_mr_state state; 307 318 enum ib_mr_type type; 319 + u64 va; 320 + u64 iova; 321 + size_t length; 322 + u32 offset; 308 323 int access; 309 324 325 + int page_shift; 326 + int page_mask; 310 327 int map_shift; 311 328 int map_mask; 312 329 313 330 u32 num_buf; 331 + u32 nbuf; 314 332 315 333 u32 max_buf; 316 334 u32 num_map; 317 335 318 336 atomic_t num_mw; 319 337 320 - struct rxe_map_set *cur_map_set; 321 - struct rxe_map_set *next_map_set; 338 + struct rxe_map **map; 322 339 }; 323 340 324 341 enum rxe_mw_state {
+4 -3
drivers/infiniband/sw/siw/siw_cm.c
··· 725 725 enum mpa_v2_ctrl mpa_p2p_mode = MPA_V2_RDMA_NO_RTR; 726 726 727 727 rv = siw_recv_mpa_rr(cep); 728 - if (rv != -EAGAIN) 729 - siw_cancel_mpatimer(cep); 730 728 if (rv) 731 729 goto out_err; 730 + 731 + siw_cancel_mpatimer(cep); 732 732 733 733 rep = &cep->mpa.hdr; 734 734 ··· 895 895 } 896 896 897 897 out_err: 898 - siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, -EINVAL); 898 + if (rv != -EAGAIN) 899 + siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, -EINVAL); 899 900 900 901 return rv; 901 902 }
+1 -1
drivers/infiniband/sw/siw/siw_verbs.c
··· 1167 1167 err_out: 1168 1168 siw_dbg(base_cq->device, "CQ creation failed: %d", rv); 1169 1169 1170 - if (cq && cq->queue) { 1170 + if (cq->queue) { 1171 1171 struct siw_ucontext *ctx = 1172 1172 rdma_udata_to_drv_context(udata, struct siw_ucontext, 1173 1173 base_ucontext);
+1 -1
drivers/infiniband/ulp/ipoib/ipoib_ib.c
··· 1109 1109 * if he sets the device address back to be based on GID index 0, 1110 1110 * he no longer wishs to control it. 1111 1111 * 1112 - * If the user doesn't control the the device address, 1112 + * If the user doesn't control the device address, 1113 1113 * IPOIB_FLAG_DEV_ADDR_SET is set and ib_find_gid failed it means 1114 1114 * the port GUID has changed and GID at index 0 has changed 1115 1115 * so we need to change priv->local_gid and priv->dev->dev_addr
+4 -2
drivers/infiniband/ulp/ipoib/ipoib_main.c
··· 1664 1664 { 1665 1665 struct ipoib_dev_priv *priv = ipoib_priv(dev); 1666 1666 1667 - netif_napi_add(dev, &priv->recv_napi, ipoib_rx_poll, IPOIB_NUM_WC); 1668 - netif_napi_add(dev, &priv->send_napi, ipoib_tx_poll, MAX_SEND_CQE); 1667 + netif_napi_add_weight(dev, &priv->recv_napi, ipoib_rx_poll, 1668 + IPOIB_NUM_WC); 1669 + netif_napi_add_weight(dev, &priv->send_napi, ipoib_tx_poll, 1670 + MAX_SEND_CQE); 1669 1671 } 1670 1672 1671 1673 static void ipoib_napi_del(struct net_device *dev)
+4 -2
drivers/infiniband/ulp/iser/iser_verbs.c
··· 246 246 device = ib_conn->device; 247 247 ib_dev = device->ib_device; 248 248 249 + /* +1 for drain */ 249 250 if (ib_conn->pi_support) 250 251 max_send_wr = ISER_QP_SIG_MAX_REQ_DTOS + 1; 251 252 else ··· 268 267 init_attr.qp_context = (void *)ib_conn; 269 268 init_attr.send_cq = ib_conn->cq; 270 269 init_attr.recv_cq = ib_conn->cq; 271 - init_attr.cap.max_recv_wr = ISER_QP_MAX_RECV_DTOS; 270 + /* +1 for drain */ 271 + init_attr.cap.max_recv_wr = ISER_QP_MAX_RECV_DTOS + 1; 272 272 init_attr.cap.max_send_sge = 2; 273 273 init_attr.cap.max_recv_sge = 1; 274 274 init_attr.sq_sig_type = IB_SIGNAL_REQ_WR; ··· 487 485 iser_conn, err); 488 486 489 487 /* block until all flush errors are consumed */ 490 - ib_drain_sq(ib_conn->qp); 488 + ib_drain_qp(ib_conn->qp); 491 489 } 492 490 493 491 return 1;
+3 -11
drivers/infiniband/ulp/rtrs/rtrs-clt-stats.c
··· 32 32 33 33 void rtrs_clt_inc_failover_cnt(struct rtrs_clt_stats *stats) 34 34 { 35 - struct rtrs_clt_stats_pcpu *s; 36 - 37 - s = get_cpu_ptr(stats->pcpu_stats); 38 - s->rdma.failover_cnt++; 39 - put_cpu_ptr(stats->pcpu_stats); 35 + this_cpu_inc(stats->pcpu_stats->rdma.failover_cnt); 40 36 } 41 37 42 38 int rtrs_clt_stats_migration_from_cnt_to_str(struct rtrs_clt_stats *stats, char *buf) ··· 165 169 static inline void rtrs_clt_update_rdma_stats(struct rtrs_clt_stats *stats, 166 170 size_t size, int d) 167 171 { 168 - struct rtrs_clt_stats_pcpu *s; 169 - 170 - s = get_cpu_ptr(stats->pcpu_stats); 171 - s->rdma.dir[d].cnt++; 172 - s->rdma.dir[d].size_total += size; 173 - put_cpu_ptr(stats->pcpu_stats); 172 + this_cpu_inc(stats->pcpu_stats->rdma.dir[d].cnt); 173 + this_cpu_add(stats->pcpu_stats->rdma.dir[d].size_total, size); 174 174 } 175 175 176 176 void rtrs_clt_update_all_stats(struct rtrs_clt_io_req *req, int dir)
+22 -28
drivers/infiniband/ulp/rtrs/rtrs-clt.c
··· 740 740 struct rtrs_clt_path *(*next_path)(struct path_it *it); 741 741 }; 742 742 743 - /** 744 - * list_next_or_null_rr_rcu - get next list element in round-robin fashion. 743 + /* 744 + * rtrs_clt_get_next_path_or_null - get clt path from the list or return NULL 745 745 * @head: the head for the list. 746 - * @ptr: the list head to take the next element from. 747 - * @type: the type of the struct this is embedded in. 748 - * @memb: the name of the list_head within the struct. 746 + * @clt_path: The element to take the next clt_path from. 749 747 * 750 - * Next element returned in round-robin fashion, i.e. head will be skipped, 748 + * Next clt path returned in round-robin fashion, i.e. head will be skipped, 751 749 * but if list is observed as empty, NULL will be returned. 752 750 * 753 - * This primitive may safely run concurrently with the _rcu list-mutation 751 + * This function may safely run concurrently with the _rcu list-mutation 754 752 * primitives such as list_add_rcu() as long as it's guarded by rcu_read_lock(). 755 753 */ 756 - #define list_next_or_null_rr_rcu(head, ptr, type, memb) \ 757 - ({ \ 758 - list_next_or_null_rcu(head, ptr, type, memb) ?: \ 759 - list_next_or_null_rcu(head, READ_ONCE((ptr)->next), \ 760 - type, memb); \ 761 - }) 754 + static inline struct rtrs_clt_path * 755 + rtrs_clt_get_next_path_or_null(struct list_head *head, struct rtrs_clt_path *clt_path) 756 + { 757 + return list_next_or_null_rcu(head, &clt_path->s.entry, typeof(*clt_path), s.entry) ?: 758 + list_next_or_null_rcu(head, 759 + READ_ONCE((&clt_path->s.entry)->next), 760 + typeof(*clt_path), s.entry); 761 + } 762 762 763 763 /** 764 764 * get_next_path_rr() - Returns path in round-robin fashion. ··· 789 789 path = list_first_or_null_rcu(&clt->paths_list, 790 790 typeof(*path), s.entry); 791 791 else 792 - path = list_next_or_null_rr_rcu(&clt->paths_list, 793 - &path->s.entry, 794 - typeof(*path), 795 - s.entry); 792 + path = rtrs_clt_get_next_path_or_null(&clt->paths_list, path); 793 + 796 794 rcu_assign_pointer(*ppcpu_path, path); 797 795 798 796 return path; ··· 1401 1403 unsigned int chunk_bits; 1402 1404 int err, i; 1403 1405 1404 - clt->permits_map = kcalloc(BITS_TO_LONGS(clt->queue_depth), 1405 - sizeof(long), GFP_KERNEL); 1406 + clt->permits_map = bitmap_zalloc(clt->queue_depth, GFP_KERNEL); 1406 1407 if (!clt->permits_map) { 1407 1408 err = -ENOMEM; 1408 1409 goto out_err; ··· 1423 1426 return 0; 1424 1427 1425 1428 err_map: 1426 - kfree(clt->permits_map); 1429 + bitmap_free(clt->permits_map); 1427 1430 clt->permits_map = NULL; 1428 1431 out_err: 1429 1432 return err; ··· 1431 1434 1432 1435 static void free_permits(struct rtrs_clt_sess *clt) 1433 1436 { 1434 - if (clt->permits_map) { 1435 - size_t sz = clt->queue_depth; 1436 - 1437 + if (clt->permits_map) 1437 1438 wait_event(clt->permits_wait, 1438 - find_first_bit(clt->permits_map, sz) >= sz); 1439 - } 1440 - kfree(clt->permits_map); 1439 + bitmap_empty(clt->permits_map, clt->queue_depth)); 1440 + 1441 + bitmap_free(clt->permits_map); 1441 1442 clt->permits_map = NULL; 1442 1443 kfree(clt->permits); 1443 1444 clt->permits = NULL; ··· 2272 2277 * removed. If @sess is the last element, then @next is NULL. 2273 2278 */ 2274 2279 rcu_read_lock(); 2275 - next = list_next_or_null_rr_rcu(&clt->paths_list, &clt_path->s.entry, 2276 - typeof(*next), s.entry); 2280 + next = rtrs_clt_get_next_path_or_null(&clt->paths_list, clt_path); 2277 2281 rcu_read_unlock(); 2278 2282 2279 2283 /*
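rtrs-clt.c above moves the permits map to the kernel bitmap API: bitmap_zalloc()/bitmap_free() replace the open-coded kcalloc(BITS_TO_LONGS(...)) allocation, and bitmap_empty() replaces the find_first_bit(map, sz) >= sz idiom used while waiting for all permits to come back. A userspace approximation of those helpers (illustrative only; the real implementations live in linux/bitmap.h):

#include <stdio.h>
#include <stdlib.h>

#define BITS_PER_LONG    (8 * sizeof(unsigned long))
#define BITS_TO_LONGS(n) (((n) + BITS_PER_LONG - 1) / BITS_PER_LONG)

static unsigned long *bitmap_zalloc(unsigned int nbits)
{
        return calloc(BITS_TO_LONGS(nbits), sizeof(unsigned long));
}

static void set_bit(unsigned long *map, unsigned int bit)
{
        map[bit / BITS_PER_LONG] |= 1UL << (bit % BITS_PER_LONG);
}

static void clear_bit(unsigned long *map, unsigned int bit)
{
        map[bit / BITS_PER_LONG] &= ~(1UL << (bit % BITS_PER_LONG));
}

static int bitmap_empty(const unsigned long *map, unsigned int nbits)
{
        unsigned int i;

        for (i = 0; i < BITS_TO_LONGS(nbits); i++)
                if (map[i])
                        return 0;       /* at least one permit still out */
        return 1;
}

int main(void)
{
        unsigned long *permits = bitmap_zalloc(512);

        set_bit(permits, 17);           /* permit 17 handed out */
        printf("empty: %d\n", bitmap_empty(permits, 512));
        clear_bit(permits, 17);         /* permit returned      */
        printf("empty: %d\n", bitmap_empty(permits, 512));

        free(permits);
        return 0;
}
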
+11 -10
drivers/infiniband/ulp/rtrs/rtrs-pri.h
··· 23 23 #define RTRS_PROTO_VER_STRING __stringify(RTRS_PROTO_VER_MAJOR) "." \ 24 24 __stringify(RTRS_PROTO_VER_MINOR) 25 25 26 + /* 27 + * Max IB immediate data size is 2^28 (MAX_IMM_PAYL_BITS) 28 + * and the minimum chunk size is 4096 (2^12). 29 + * So the maximum sess_queue_depth is 65536 (2^16) in theory. 30 + * But mempool_create, create_qp and ib_post_send fail with 31 + * "cannot allocate memory" error if sess_queue_depth is too big. 32 + * Therefore the pratical max value of sess_queue_depth is 33 + * somewhere between 1 and 65534 and it depends on the system. 34 + */ 35 + #define MAX_SESS_QUEUE_DEPTH 65535 36 + 26 37 enum rtrs_imm_const { 27 38 MAX_IMM_TYPE_BITS = 4, 28 39 MAX_IMM_TYPE_MASK = ((1 << MAX_IMM_TYPE_BITS) - 1), ··· 57 46 58 47 MAX_PATHS_NUM = 128, 59 48 60 - /* 61 - * Max IB immediate data size is 2^28 (MAX_IMM_PAYL_BITS) 62 - * and the minimum chunk size is 4096 (2^12). 63 - * So the maximum sess_queue_depth is 65536 (2^16) in theory. 64 - * But mempool_create, create_qp and ib_post_send fail with 65 - * "cannot allocate memory" error if sess_queue_depth is too big. 66 - * Therefore the pratical max value of sess_queue_depth is 67 - * somewhere between 1 and 65534 and it depends on the system. 68 - */ 69 - MAX_SESS_QUEUE_DEPTH = 65535, 70 49 MIN_CHUNK_SIZE = 8192, 71 50 72 51 RTRS_HB_INTERVAL_MS = 5000,
+24 -8
drivers/infiniband/ulp/rtrs/rtrs-srv-stats.c
··· 14 14 int rtrs_srv_reset_rdma_stats(struct rtrs_srv_stats *stats, bool enable) 15 15 { 16 16 if (enable) { 17 - struct rtrs_srv_stats_rdma_stats *r = &stats->rdma_stats; 17 + int cpu; 18 + struct rtrs_srv_stats_rdma_stats *r; 18 19 19 - memset(r, 0, sizeof(*r)); 20 + for_each_possible_cpu(cpu) { 21 + r = per_cpu_ptr(stats->rdma_stats, cpu); 22 + memset(r, 0, sizeof(*r)); 23 + } 24 + 20 25 return 0; 21 26 } 22 27 ··· 30 25 31 26 ssize_t rtrs_srv_stats_rdma_to_str(struct rtrs_srv_stats *stats, char *page) 32 27 { 33 - struct rtrs_srv_stats_rdma_stats *r = &stats->rdma_stats; 28 + int cpu; 29 + struct rtrs_srv_stats_rdma_stats sum; 30 + struct rtrs_srv_stats_rdma_stats *r; 34 31 35 - return sysfs_emit(page, "%lld %lld %lld %lldn %u\n", 36 - (s64)atomic64_read(&r->dir[READ].cnt), 37 - (s64)atomic64_read(&r->dir[READ].size_total), 38 - (s64)atomic64_read(&r->dir[WRITE].cnt), 39 - (s64)atomic64_read(&r->dir[WRITE].size_total), 0); 32 + memset(&sum, 0, sizeof(sum)); 33 + 34 + for_each_possible_cpu(cpu) { 35 + r = per_cpu_ptr(stats->rdma_stats, cpu); 36 + 37 + sum.dir[READ].cnt += r->dir[READ].cnt; 38 + sum.dir[READ].size_total += r->dir[READ].size_total; 39 + sum.dir[WRITE].cnt += r->dir[WRITE].cnt; 40 + sum.dir[WRITE].size_total += r->dir[WRITE].size_total; 41 + } 42 + 43 + return sysfs_emit(page, "%llu %llu %llu %llu\n", 44 + sum.dir[READ].cnt, sum.dir[READ].size_total, 45 + sum.dir[WRITE].cnt, sum.dir[WRITE].size_total); 40 46 }
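The rtrs server stats above switch from shared atomic64 counters to per-CPU counters: writers bump their own CPU's copy with this_cpu_inc()/this_cpu_add() and the sysfs read side sums over all possible CPUs. A single-threaded userspace sketch of the same read/write split (not kernel code; the per-CPU area is modeled as a plain array):

#include <stdio.h>

#define NR_CPUS 4

struct rdma_stats {
        unsigned long long cnt;
        unsigned long long size_total;
};

/* one private copy per cpu, as alloc_percpu() provides in the kernel */
static struct rdma_stats pcpu_stats[NR_CPUS];

static void update_stats(int cpu, unsigned long long size)
{
        pcpu_stats[cpu].cnt++;                  /* this_cpu_inc() */
        pcpu_stats[cpu].size_total += size;     /* this_cpu_add() */
}

static struct rdma_stats read_stats(void)
{
        struct rdma_stats sum = { 0, 0 };
        int cpu;

        for (cpu = 0; cpu < NR_CPUS; cpu++) {   /* for_each_possible_cpu() */
                sum.cnt += pcpu_stats[cpu].cnt;
                sum.size_total += pcpu_stats[cpu].size_total;
        }
        return sum;
}

int main(void)
{
        struct rdma_stats sum;

        update_stats(0, 4096);
        update_stats(2, 8192);

        sum = read_stats();
        printf("%llu ops, %llu bytes\n", sum.cnt, sum.size_total);
        return 0;
}
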
+2
drivers/infiniband/ulp/rtrs/rtrs-srv-sysfs.c
··· 220 220 221 221 stats = container_of(kobj, struct rtrs_srv_stats, kobj_stats); 222 222 223 + free_percpu(stats->rdma_stats); 224 + 223 225 kfree(stats); 224 226 } 225 227
+14 -18
drivers/infiniband/ulp/rtrs/rtrs-srv.c
··· 11 11 #define pr_fmt(fmt) KBUILD_MODNAME " L" __stringify(__LINE__) ": " fmt 12 12 13 13 #include <linux/module.h> 14 - #include <linux/mempool.h> 15 14 16 15 #include "rtrs-srv.h" 17 16 #include "rtrs-log.h" ··· 25 26 #define DEFAULT_SESS_QUEUE_DEPTH 512 26 27 #define MAX_HDR_SIZE PAGE_SIZE 27 28 28 - /* We guarantee to serve 10 paths at least */ 29 - #define CHUNK_POOL_SZ 10 30 - 31 29 static struct rtrs_rdma_dev_pd dev_pd; 32 - static mempool_t *chunk_pool; 33 30 struct class *rtrs_dev_class; 34 31 static struct rtrs_srv_ib_ctx ib_ctx; 35 32 ··· 1353 1358 1354 1359 WARN_ON(refcount_read(&srv->refcount)); 1355 1360 for (i = 0; i < srv->queue_depth; i++) 1356 - mempool_free(srv->chunks[i], chunk_pool); 1361 + __free_pages(srv->chunks[i], get_order(max_chunk_size)); 1357 1362 kfree(srv->chunks); 1358 1363 mutex_destroy(&srv->paths_mutex); 1359 1364 mutex_destroy(&srv->paths_ev_mutex); ··· 1406 1411 goto err_free_srv; 1407 1412 1408 1413 for (i = 0; i < srv->queue_depth; i++) { 1409 - srv->chunks[i] = mempool_alloc(chunk_pool, GFP_KERNEL); 1414 + srv->chunks[i] = alloc_pages(GFP_KERNEL, 1415 + get_order(max_chunk_size)); 1410 1416 if (!srv->chunks[i]) 1411 1417 goto err_free_chunks; 1412 1418 } ··· 1420 1424 1421 1425 err_free_chunks: 1422 1426 while (i--) 1423 - mempool_free(srv->chunks[i], chunk_pool); 1427 + __free_pages(srv->chunks[i], get_order(max_chunk_size)); 1424 1428 kfree(srv->chunks); 1425 1429 1426 1430 err_free_srv: ··· 1509 1513 kobject_del(&srv_path->kobj); 1510 1514 kobject_put(&srv_path->kobj); 1511 1515 } else { 1516 + free_percpu(srv_path->stats->rdma_stats); 1512 1517 kfree(srv_path->stats); 1513 1518 kfree(srv_path); 1514 1519 } ··· 1752 1755 if (!srv_path->stats) 1753 1756 goto err_free_sess; 1754 1757 1758 + srv_path->stats->rdma_stats = alloc_percpu(struct rtrs_srv_stats_rdma_stats); 1759 + if (!srv_path->stats->rdma_stats) 1760 + goto err_free_stats; 1761 + 1755 1762 srv_path->stats->srv_path = srv_path; 1756 1763 1757 1764 srv_path->dma_addr = kcalloc(srv->queue_depth, 1758 1765 sizeof(*srv_path->dma_addr), 1759 1766 GFP_KERNEL); 1760 1767 if (!srv_path->dma_addr) 1761 - goto err_free_stats; 1768 + goto err_free_percpu; 1762 1769 1763 1770 srv_path->s.con = kcalloc(con_num, sizeof(*srv_path->s.con), 1764 1771 GFP_KERNEL); ··· 1814 1813 kfree(srv_path->s.con); 1815 1814 err_free_dma_addr: 1816 1815 kfree(srv_path->dma_addr); 1816 + err_free_percpu: 1817 + free_percpu(srv_path->stats->rdma_stats); 1817 1818 err_free_stats: 1818 1819 kfree(srv_path->stats); 1819 1820 err_free_sess: ··· 2269 2266 err); 2270 2267 return err; 2271 2268 } 2272 - chunk_pool = mempool_create_page_pool(sess_queue_depth * CHUNK_POOL_SZ, 2273 - get_order(max_chunk_size)); 2274 - if (!chunk_pool) 2275 - return -ENOMEM; 2276 2269 rtrs_dev_class = class_create(THIS_MODULE, "rtrs-server"); 2277 2270 if (IS_ERR(rtrs_dev_class)) { 2278 2271 err = PTR_ERR(rtrs_dev_class); 2279 - goto out_chunk_pool; 2272 + goto out_err; 2280 2273 } 2281 2274 rtrs_wq = alloc_workqueue("rtrs_server_wq", 0, 0); 2282 2275 if (!rtrs_wq) { ··· 2284 2285 2285 2286 out_dev_class: 2286 2287 class_destroy(rtrs_dev_class); 2287 - out_chunk_pool: 2288 - mempool_destroy(chunk_pool); 2289 - 2288 + out_err: 2290 2289 return err; 2291 2290 } 2292 2291 ··· 2292 2295 { 2293 2296 destroy_workqueue(rtrs_wq); 2294 2297 class_destroy(rtrs_dev_class); 2295 - mempool_destroy(chunk_pool); 2296 2298 rtrs_rdma_dev_pd_deinit(&dev_pd); 2297 2299 } 2298 2300
+8 -7
drivers/infiniband/ulp/rtrs/rtrs-srv.h
··· 12 12 13 13 #include <linux/device.h> 14 14 #include <linux/refcount.h> 15 + #include <linux/percpu.h> 15 16 #include "rtrs-pri.h" 16 17 17 18 /* ··· 30 29 */ 31 30 struct rtrs_srv_stats_rdma_stats { 32 31 struct { 33 - atomic64_t cnt; 34 - atomic64_t size_total; 32 + u64 cnt; 33 + u64 size_total; 35 34 } dir[2]; 36 35 }; 37 36 38 37 struct rtrs_srv_stats { 39 - struct kobject kobj_stats; 40 - struct rtrs_srv_stats_rdma_stats rdma_stats; 41 - struct rtrs_srv_path *srv_path; 38 + struct kobject kobj_stats; 39 + struct rtrs_srv_stats_rdma_stats __percpu *rdma_stats; 40 + struct rtrs_srv_path *srv_path; 42 41 }; 43 42 44 43 struct rtrs_srv_con { ··· 131 130 static inline void rtrs_srv_update_rdma_stats(struct rtrs_srv_stats *s, 132 131 size_t size, int d) 133 132 { 134 - atomic64_inc(&s->rdma_stats.dir[d].cnt); 135 - atomic64_add(size, &s->rdma_stats.dir[d].size_total); 133 + this_cpu_inc(s->rdma_stats->dir[d].cnt); 134 + this_cpu_add(s->rdma_stats->dir[d].size_total, size); 136 135 } 137 136 138 137 /* functions which are implemented in rtrs-srv-stats.c */
+110 -46
drivers/infiniband/ulp/srpt/ib_srpt.c
··· 565 565 if (ret) 566 566 return ret; 567 567 568 - sport->port_guid_id.wwn.priv = sport; 569 - srpt_format_guid(sport->port_guid_id.name, 570 - sizeof(sport->port_guid_id.name), 568 + srpt_format_guid(sport->guid_name, ARRAY_SIZE(sport->guid_name), 571 569 &sport->gid.global.interface_id); 572 - sport->port_gid_id.wwn.priv = sport; 573 - snprintf(sport->port_gid_id.name, sizeof(sport->port_gid_id.name), 570 + snprintf(sport->gid_name, ARRAY_SIZE(sport->gid_name), 574 571 "0x%016llx%016llx", 575 572 be64_to_cpu(sport->gid.global.subnet_prefix), 576 573 be64_to_cpu(sport->gid.global.interface_id)); ··· 2218 2221 ch->zw_cqe.done = srpt_zerolength_write_done; 2219 2222 INIT_WORK(&ch->release_work, srpt_release_channel_work); 2220 2223 ch->sport = sport; 2221 - if (ib_cm_id) { 2222 - ch->ib_cm.cm_id = ib_cm_id; 2223 - ib_cm_id->context = ch; 2224 - } else { 2224 + if (rdma_cm_id) { 2225 2225 ch->using_rdma_cm = true; 2226 2226 ch->rdma_cm.cm_id = rdma_cm_id; 2227 2227 rdma_cm_id->context = ch; 2228 + } else { 2229 + ch->ib_cm.cm_id = ib_cm_id; 2230 + ib_cm_id->context = ch; 2228 2231 } 2229 2232 /* 2230 2233 * ch->rq_size should be at least as large as the initiator queue ··· 2311 2314 tag_num = ch->rq_size; 2312 2315 tag_size = 1; /* ib_srpt does not use se_sess->sess_cmd_map */ 2313 2316 2314 - mutex_lock(&sport->port_guid_id.mutex); 2315 - list_for_each_entry(stpg, &sport->port_guid_id.tpg_list, entry) { 2316 - if (!IS_ERR_OR_NULL(ch->sess)) 2317 - break; 2318 - ch->sess = target_setup_session(&stpg->tpg, tag_num, 2317 + if (sport->guid_id) { 2318 + mutex_lock(&sport->guid_id->mutex); 2319 + list_for_each_entry(stpg, &sport->guid_id->tpg_list, entry) { 2320 + if (!IS_ERR_OR_NULL(ch->sess)) 2321 + break; 2322 + ch->sess = target_setup_session(&stpg->tpg, tag_num, 2319 2323 tag_size, TARGET_PROT_NORMAL, 2320 2324 ch->sess_name, ch, NULL); 2325 + } 2326 + mutex_unlock(&sport->guid_id->mutex); 2321 2327 } 2322 - mutex_unlock(&sport->port_guid_id.mutex); 2323 2328 2324 - mutex_lock(&sport->port_gid_id.mutex); 2325 - list_for_each_entry(stpg, &sport->port_gid_id.tpg_list, entry) { 2326 - if (!IS_ERR_OR_NULL(ch->sess)) 2327 - break; 2328 - ch->sess = target_setup_session(&stpg->tpg, tag_num, 2329 + if (sport->gid_id) { 2330 + mutex_lock(&sport->gid_id->mutex); 2331 + list_for_each_entry(stpg, &sport->gid_id->tpg_list, entry) { 2332 + if (!IS_ERR_OR_NULL(ch->sess)) 2333 + break; 2334 + ch->sess = target_setup_session(&stpg->tpg, tag_num, 2329 2335 tag_size, TARGET_PROT_NORMAL, i_port_id, 2330 2336 ch, NULL); 2331 - if (!IS_ERR_OR_NULL(ch->sess)) 2332 - break; 2333 - /* Retry without leading "0x" */ 2334 - ch->sess = target_setup_session(&stpg->tpg, tag_num, 2337 + if (!IS_ERR_OR_NULL(ch->sess)) 2338 + break; 2339 + /* Retry without leading "0x" */ 2340 + ch->sess = target_setup_session(&stpg->tpg, tag_num, 2335 2341 tag_size, TARGET_PROT_NORMAL, 2336 2342 i_port_id + 2, ch, NULL); 2343 + } 2344 + mutex_unlock(&sport->gid_id->mutex); 2337 2345 } 2338 - mutex_unlock(&sport->port_gid_id.mutex); 2339 2346 2340 2347 if (IS_ERR_OR_NULL(ch->sess)) { 2341 2348 WARN_ON_ONCE(ch->sess == NULL); ··· 2984 2983 return 0; 2985 2984 } 2986 2985 2987 - static struct se_wwn *__srpt_lookup_wwn(const char *name) 2986 + struct port_and_port_id { 2987 + struct srpt_port *sport; 2988 + struct srpt_port_id **port_id; 2989 + }; 2990 + 2991 + static struct port_and_port_id __srpt_lookup_port(const char *name) 2988 2992 { 2989 2993 struct ib_device *dev; 2990 2994 struct srpt_device *sdev; ··· 3004 2998 for (i = 0; i < 
dev->phys_port_cnt; i++) { 3005 2999 sport = &sdev->port[i]; 3006 3000 3007 - if (strcmp(sport->port_guid_id.name, name) == 0) 3008 - return &sport->port_guid_id.wwn; 3009 - if (strcmp(sport->port_gid_id.name, name) == 0) 3010 - return &sport->port_gid_id.wwn; 3001 + if (strcmp(sport->guid_name, name) == 0) { 3002 + kref_get(&sdev->refcnt); 3003 + return (struct port_and_port_id){ 3004 + sport, &sport->guid_id}; 3005 + } 3006 + if (strcmp(sport->gid_name, name) == 0) { 3007 + kref_get(&sdev->refcnt); 3008 + return (struct port_and_port_id){ 3009 + sport, &sport->gid_id}; 3010 + } 3011 3011 } 3012 3012 } 3013 3013 3014 - return NULL; 3014 + return (struct port_and_port_id){}; 3015 3015 } 3016 3016 3017 - static struct se_wwn *srpt_lookup_wwn(const char *name) 3017 + /** 3018 + * srpt_lookup_port() - Look up an RDMA port by name 3019 + * @name: ASCII port name 3020 + * 3021 + * Increments the RDMA port reference count if an RDMA port pointer is returned. 3022 + * The caller must drop that reference count by calling srpt_port_put_ref(). 3023 + */ 3024 + static struct port_and_port_id srpt_lookup_port(const char *name) 3018 3025 { 3019 - struct se_wwn *wwn; 3026 + struct port_and_port_id papi; 3020 3027 3021 3028 spin_lock(&srpt_dev_lock); 3022 - wwn = __srpt_lookup_wwn(name); 3029 + papi = __srpt_lookup_port(name); 3023 3030 spin_unlock(&srpt_dev_lock); 3024 3031 3025 - return wwn; 3032 + return papi; 3026 3033 } 3027 3034 3028 3035 static void srpt_free_srq(struct srpt_device *sdev) ··· 3120 3101 return ret; 3121 3102 } 3122 3103 3104 + static void srpt_free_sdev(struct kref *refcnt) 3105 + { 3106 + struct srpt_device *sdev = container_of(refcnt, typeof(*sdev), refcnt); 3107 + 3108 + kfree(sdev); 3109 + } 3110 + 3111 + static void srpt_sdev_put(struct srpt_device *sdev) 3112 + { 3113 + kref_put(&sdev->refcnt, srpt_free_sdev); 3114 + } 3115 + 3123 3116 /** 3124 3117 * srpt_add_one - InfiniBand device addition callback function 3125 3118 * @device: Describes a HCA. 
··· 3150 3119 if (!sdev) 3151 3120 return -ENOMEM; 3152 3121 3122 + kref_init(&sdev->refcnt); 3153 3123 sdev->device = device; 3154 3124 mutex_init(&sdev->sdev_mutex); 3155 3125 ··· 3214 3182 sport->port_attrib.srp_sq_size = DEF_SRPT_SQ_SIZE; 3215 3183 sport->port_attrib.use_srq = false; 3216 3184 INIT_WORK(&sport->work, srpt_refresh_port_work); 3217 - mutex_init(&sport->port_guid_id.mutex); 3218 - INIT_LIST_HEAD(&sport->port_guid_id.tpg_list); 3219 - mutex_init(&sport->port_gid_id.mutex); 3220 - INIT_LIST_HEAD(&sport->port_gid_id.tpg_list); 3221 3185 3222 3186 ret = srpt_refresh_port(sport); 3223 3187 if (ret) { ··· 3242 3214 srpt_free_srq(sdev); 3243 3215 ib_dealloc_pd(sdev->pd); 3244 3216 free_dev: 3245 - kfree(sdev); 3217 + srpt_sdev_put(sdev); 3246 3218 pr_info("%s(%s) failed.\n", __func__, dev_name(&device->dev)); 3247 3219 return ret; 3248 3220 } ··· 3286 3258 3287 3259 ib_dealloc_pd(sdev->pd); 3288 3260 3289 - kfree(sdev); 3261 + srpt_sdev_put(sdev); 3290 3262 } 3291 3263 3292 3264 static struct ib_client srpt_client = { ··· 3314 3286 { 3315 3287 struct srpt_port *sport = wwn->priv; 3316 3288 3317 - if (wwn == &sport->port_guid_id.wwn) 3318 - return &sport->port_guid_id; 3319 - if (wwn == &sport->port_gid_id.wwn) 3320 - return &sport->port_gid_id; 3289 + if (sport->guid_id && &sport->guid_id->wwn == wwn) 3290 + return sport->guid_id; 3291 + if (sport->gid_id && &sport->gid_id->wwn == wwn) 3292 + return sport->gid_id; 3321 3293 WARN_ON_ONCE(true); 3322 3294 return NULL; 3323 3295 } ··· 3802 3774 struct config_group *group, 3803 3775 const char *name) 3804 3776 { 3805 - return srpt_lookup_wwn(name) ? : ERR_PTR(-EINVAL); 3777 + struct port_and_port_id papi = srpt_lookup_port(name); 3778 + struct srpt_port *sport = papi.sport; 3779 + struct srpt_port_id *port_id; 3780 + 3781 + if (!papi.port_id) 3782 + return ERR_PTR(-EINVAL); 3783 + if (*papi.port_id) { 3784 + /* Attempt to create a directory that already exists. */ 3785 + WARN_ON_ONCE(true); 3786 + return &(*papi.port_id)->wwn; 3787 + } 3788 + port_id = kzalloc(sizeof(*port_id), GFP_KERNEL); 3789 + if (!port_id) { 3790 + srpt_sdev_put(sport->sdev); 3791 + return ERR_PTR(-ENOMEM); 3792 + } 3793 + mutex_init(&port_id->mutex); 3794 + INIT_LIST_HEAD(&port_id->tpg_list); 3795 + port_id->wwn.priv = sport; 3796 + memcpy(port_id->name, port_id == sport->guid_id ? sport->guid_name : 3797 + sport->gid_name, ARRAY_SIZE(port_id->name)); 3798 + 3799 + *papi.port_id = port_id; 3800 + 3801 + return &port_id->wwn; 3806 3802 } 3807 3803 3808 3804 /** ··· 3835 3783 */ 3836 3784 static void srpt_drop_tport(struct se_wwn *wwn) 3837 3785 { 3786 + struct srpt_port_id *port_id = container_of(wwn, typeof(*port_id), wwn); 3787 + struct srpt_port *sport = wwn->priv; 3788 + 3789 + if (sport->guid_id == port_id) 3790 + sport->guid_id = NULL; 3791 + else if (sport->gid_id == port_id) 3792 + sport->gid_id = NULL; 3793 + else 3794 + WARN_ON_ONCE(true); 3795 + 3796 + srpt_sdev_put(sport->sdev); 3797 + kfree(port_id); 3838 3798 } 3839 3799 3840 3800 static ssize_t srpt_wwn_version_show(struct config_item *item, char *buf)
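The ib_srpt.c hunks above move struct srpt_device to kref-based lifetime management: srpt_add_one() initializes the count, __srpt_lookup_port() takes an extra reference for every port it hands back, and srpt_sdev_put() drops it, so the structure is only freed once the last user is gone. The stand-alone sketch below shows the same kref pattern in isolation; the demo_* names are invented for illustration and are not part of the driver.

#include <linux/container_of.h>
#include <linux/kref.h>
#include <linux/slab.h>

struct demo_dev {
	struct kref refcnt;		/* lifetime of this object */
	/* ... device-specific state ... */
};

/* Called by kref_put() once the last reference has been dropped. */
static void demo_dev_release(struct kref *kref)
{
	struct demo_dev *dev = container_of(kref, struct demo_dev, refcnt);

	kfree(dev);
}

static struct demo_dev *demo_dev_alloc(void)
{
	struct demo_dev *dev = kzalloc(sizeof(*dev), GFP_KERNEL);

	if (dev)
		kref_init(&dev->refcnt);	/* reference count starts at 1 */
	return dev;
}

/* A lookup that hands out the pointer must take an extra reference ... */
static struct demo_dev *demo_dev_get(struct demo_dev *dev)
{
	kref_get(&dev->refcnt);
	return dev;
}

/* ... and every such user must eventually drop it again. */
static void demo_dev_put(struct demo_dev *dev)
{
	kref_put(&dev->refcnt, demo_dev_release);
}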
+12 -6
drivers/infiniband/ulp/srpt/ib_srpt.h
··· 376 376 }; 377 377 378 378 /** 379 - * struct srpt_port_id - information about an RDMA port name 379 + * struct srpt_port_id - LIO RDMA port information 380 380 * @mutex: Protects @tpg_list changes. 381 381 * @tpg_list: TPGs associated with the RDMA port name. 382 382 * @wwn: WWN associated with the RDMA port name. ··· 393 393 }; 394 394 395 395 /** 396 - * struct srpt_port - information associated by SRPT with a single IB port 396 + * struct srpt_port - SRPT RDMA port information 397 397 * @sdev: backpointer to the HCA information. 398 398 * @mad_agent: per-port management datagram processing information. 399 399 * @enabled: Whether or not this target port is enabled. ··· 402 402 * @lid: cached value of the port's lid. 403 403 * @gid: cached value of the port's gid. 404 404 * @work: work structure for refreshing the aforementioned cached values. 405 - * @port_guid_id: target port GUID 406 - * @port_gid_id: target port GID 405 + * @guid_name: port name in GUID format. 406 + * @guid_id: LIO target port information for the port name in GUID format. 407 + * @gid_name: port name in GID format. 408 + * @gid_id: LIO target port information for the port name in GID format. 407 409 * @port_attrib: Port attributes that can be accessed through configfs. 408 410 * @refcount: Number of objects associated with this port. 409 411 * @freed_channels: Completion that will be signaled once @refcount becomes 0. ··· 421 419 u32 lid; 422 420 union ib_gid gid; 423 421 struct work_struct work; 424 - struct srpt_port_id port_guid_id; 425 - struct srpt_port_id port_gid_id; 422 + char guid_name[64]; 423 + struct srpt_port_id *guid_id; 424 + char gid_name[64]; 425 + struct srpt_port_id *gid_id; 426 426 struct srpt_port_attrib port_attrib; 427 427 atomic_t refcount; 428 428 struct completion *freed_channels; ··· 434 430 435 431 /** 436 432 * struct srpt_device - information associated by SRPT with a single HCA 433 + * @refcnt: Reference count for this device. 437 434 * @device: Backpointer to the struct ib_device managed by the IB core. 438 435 * @pd: IB protection domain. 439 436 * @lkey: L_Key (local key) with write access to all local memory. ··· 450 445 * @port: Information about the ports owned by this HCA. 451 446 */ 452 447 struct srpt_device { 448 + struct kref refcnt; 453 449 struct ib_device *device; 454 450 struct ib_pd *pd; 455 451 u32 lkey;
+10 -6
drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c
··· 50 50 51 51 static int mlx5_cmd_stub_create_flow_table(struct mlx5_flow_root_namespace *ns, 52 52 struct mlx5_flow_table *ft, 53 - unsigned int size, 53 + struct mlx5_flow_table_attr *ft_attr, 54 54 struct mlx5_flow_table *next_ft) 55 55 { 56 - ft->max_fte = size ? roundup_pow_of_two(size) : 1; 56 + int max_fte = ft_attr->max_fte; 57 + 58 + ft->max_fte = max_fte ? roundup_pow_of_two(max_fte) : 1; 57 59 58 60 return 0; 59 61 } ··· 260 258 261 259 static int mlx5_cmd_create_flow_table(struct mlx5_flow_root_namespace *ns, 262 260 struct mlx5_flow_table *ft, 263 - unsigned int size, 261 + struct mlx5_flow_table_attr *ft_attr, 264 262 struct mlx5_flow_table *next_ft) 265 263 { 266 264 int en_encap = !!(ft->flags & MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT); ··· 269 267 u32 out[MLX5_ST_SZ_DW(create_flow_table_out)] = {}; 270 268 u32 in[MLX5_ST_SZ_DW(create_flow_table_in)] = {}; 271 269 struct mlx5_core_dev *dev = ns->dev; 270 + unsigned int size; 272 271 int err; 273 272 274 - if (size != POOL_NEXT_SIZE) 275 - size = roundup_pow_of_two(size); 276 - size = mlx5_ft_pool_get_avail_sz(dev, ft->type, size); 273 + if (ft_attr->max_fte != POOL_NEXT_SIZE) 274 + size = roundup_pow_of_two(ft_attr->max_fte); 275 + size = mlx5_ft_pool_get_avail_sz(dev, ft->type, ft_attr->max_fte); 277 276 if (!size) 278 277 return -ENOSPC; 279 278 280 279 MLX5_SET(create_flow_table_in, in, opcode, 281 280 MLX5_CMD_OP_CREATE_FLOW_TABLE); 282 281 282 + MLX5_SET(create_flow_table_in, in, uid, ft_attr->uid); 283 283 MLX5_SET(create_flow_table_in, in, table_type, ft->type); 284 284 MLX5_SET(create_flow_table_in, in, flow_table_context.level, ft->level); 285 285 MLX5_SET(create_flow_table_in, in, flow_table_context.log_size, size ? ilog2(size) : 0);
+1 -1
drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.h
··· 38 38 struct mlx5_flow_cmds { 39 39 int (*create_flow_table)(struct mlx5_flow_root_namespace *ns, 40 40 struct mlx5_flow_table *ft, 41 - unsigned int size, 41 + struct mlx5_flow_table_attr *ft_attr, 42 42 struct mlx5_flow_table *next_ft); 43 43 int (*destroy_flow_table)(struct mlx5_flow_root_namespace *ns, 44 44 struct mlx5_flow_table *ft);
+7 -1
drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
··· 1155 1155 find_next_chained_ft(fs_prio); 1156 1156 ft->def_miss_action = ns->def_miss_action; 1157 1157 ft->ns = ns; 1158 - err = root->cmds->create_flow_table(root, ft, ft_attr->max_fte, next_ft); 1158 + err = root->cmds->create_flow_table(root, ft, ft_attr, next_ft); 1159 1159 if (err) 1160 1160 goto free_ft; 1161 1161 ··· 1194 1194 return __mlx5_create_flow_table(ns, ft_attr, FS_FT_OP_MOD_NORMAL, 0); 1195 1195 } 1196 1196 EXPORT_SYMBOL(mlx5_create_flow_table); 1197 + 1198 + u32 mlx5_flow_table_id(struct mlx5_flow_table *ft) 1199 + { 1200 + return ft->id; 1201 + } 1202 + EXPORT_SYMBOL(mlx5_flow_table_id); 1197 1203 1198 1204 struct mlx5_flow_table * 1199 1205 mlx5_create_vport_flow_table(struct mlx5_flow_namespace *ns,
+1
drivers/net/ethernet/mellanox/mlx5/core/steering/dr_cmd.c
··· 439 439 440 440 MLX5_SET(create_flow_table_in, in, opcode, MLX5_CMD_OP_CREATE_FLOW_TABLE); 441 441 MLX5_SET(create_flow_table_in, in, table_type, attr->table_type); 442 + MLX5_SET(create_flow_table_in, in, uid, attr->uid); 442 443 443 444 ft_mdev = MLX5_ADDR_OF(create_flow_table_in, in, flow_table_context); 444 445 MLX5_SET(flow_table_context, ft_mdev, termination_table, attr->term_tbl);
+5 -3
drivers/net/ethernet/mellanox/mlx5/core/steering/dr_table.c
··· 214 214 tbl->table_type); 215 215 } 216 216 217 - static int dr_table_create_sw_owned_tbl(struct mlx5dr_table *tbl) 217 + static int dr_table_create_sw_owned_tbl(struct mlx5dr_table *tbl, u16 uid) 218 218 { 219 219 bool en_encap = !!(tbl->flags & MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT); 220 220 bool en_decap = !!(tbl->flags & MLX5_FLOW_TABLE_TUNNEL_EN_DECAP); ··· 236 236 ft_attr.sw_owner = true; 237 237 ft_attr.decap_en = en_decap; 238 238 ft_attr.reformat_en = en_encap; 239 + ft_attr.uid = uid; 239 240 240 241 ret = mlx5dr_cmd_create_flow_table(tbl->dmn->mdev, &ft_attr, 241 242 NULL, &tbl->table_id); ··· 244 243 return ret; 245 244 } 246 245 247 - struct mlx5dr_table *mlx5dr_table_create(struct mlx5dr_domain *dmn, u32 level, u32 flags) 246 + struct mlx5dr_table *mlx5dr_table_create(struct mlx5dr_domain *dmn, u32 level, 247 + u32 flags, u16 uid) 248 248 { 249 249 struct mlx5dr_table *tbl; 250 250 int ret; ··· 265 263 if (ret) 266 264 goto free_tbl; 267 265 268 - ret = dr_table_create_sw_owned_tbl(tbl); 266 + ret = dr_table_create_sw_owned_tbl(tbl, uid); 269 267 if (ret) 270 268 goto uninit_tbl; 271 269
+1
drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h
··· 1217 1217 1218 1218 struct mlx5dr_cmd_create_flow_table_attr { 1219 1219 u32 table_type; 1220 + u16 uid; 1220 1221 u64 icm_addr_rx; 1221 1222 u64 icm_addr_tx; 1222 1223 u8 level;
+4 -3
drivers/net/ethernet/mellanox/mlx5/core/steering/fs_dr.c
··· 62 62 63 63 static int mlx5_cmd_dr_create_flow_table(struct mlx5_flow_root_namespace *ns, 64 64 struct mlx5_flow_table *ft, 65 - unsigned int size, 65 + struct mlx5_flow_table_attr *ft_attr, 66 66 struct mlx5_flow_table *next_ft) 67 67 { 68 68 struct mlx5dr_table *tbl; ··· 71 71 72 72 if (mlx5_dr_is_fw_table(ft->flags)) 73 73 return mlx5_fs_cmd_get_fw_cmds()->create_flow_table(ns, ft, 74 - size, 74 + ft_attr, 75 75 next_ft); 76 76 flags = ft->flags; 77 77 /* turn off encap/decap if not supported for sw-str by fw */ ··· 79 79 flags = ft->flags & ~(MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT | 80 80 MLX5_FLOW_TABLE_TUNNEL_EN_DECAP); 81 81 82 - tbl = mlx5dr_table_create(ns->fs_dr_domain.dr_domain, ft->level, flags); 82 + tbl = mlx5dr_table_create(ns->fs_dr_domain.dr_domain, ft->level, flags, 83 + ft_attr->uid); 83 84 if (!tbl) { 84 85 mlx5_core_err(ns->dev, "Failed creating dr flow_table\n"); 85 86 return -EINVAL;
+2 -1
drivers/net/ethernet/mellanox/mlx5/core/steering/mlx5dr.h
··· 51 51 struct mlx5dr_domain *peer_dmn); 52 52 53 53 struct mlx5dr_table * 54 - mlx5dr_table_create(struct mlx5dr_domain *domain, u32 level, u32 flags); 54 + mlx5dr_table_create(struct mlx5dr_domain *domain, u32 level, u32 flags, 55 + u16 uid); 55 56 56 57 struct mlx5dr_table * 57 58 mlx5dr_table_get_from_fs_ft(struct mlx5_flow_table *ft);
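With the extra parameter above, every caller of mlx5dr_table_create() now has to say which uid owns the table; the in-tree caller in fs_dr.c (earlier in this diff) forwards ft_attr->uid. As a minimal illustration, a kernel-owned table would pass uid 0, the value mlx5 uses for kernel-context objects; the wrapper name below is invented.

#include "mlx5dr.h"	/* the header changed just above */

/* Hypothetical caller: create a software-steering table owned by the kernel. */
static struct mlx5dr_table *demo_create_kernel_table(struct mlx5dr_domain *dmn,
						      u32 level, u32 flags)
{
	/* uid 0 designates the kernel context; user contexts pass their own uid. */
	return mlx5dr_table_create(dmn, level, flags, 0);
}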
+3 -3
include/linux/mlx5/driver.h
··· 733 733 }; 734 734 735 735 enum { 736 - MR_CACHE_LAST_STD_ENTRY = 20, 736 + MKEY_CACHE_LAST_STD_ENTRY = 20, 737 737 MLX5_IMR_MTT_CACHE_ENTRY, 738 738 MLX5_IMR_KSM_CACHE_ENTRY, 739 - MAX_MR_CACHE_ENTRIES 739 + MAX_MKEY_CACHE_ENTRIES 740 740 }; 741 741 742 742 struct mlx5_profile { ··· 745 745 struct { 746 746 int size; 747 747 int limit; 748 - } mr_cache[MAX_MR_CACHE_ENTRIES]; 748 + } mr_cache[MAX_MKEY_CACHE_ENTRIES]; 749 749 }; 750 750 751 751 struct mlx5_hca_cap {
+2
include/linux/mlx5/fs.h
··· 178 178 int max_fte; 179 179 u32 level; 180 180 u32 flags; 181 + u16 uid; 181 182 struct mlx5_flow_table *next_ft; 182 183 183 184 struct { ··· 316 315 void mlx5_packet_reformat_dealloc(struct mlx5_core_dev *dev, 317 316 struct mlx5_pkt_reformat *reformat); 318 317 318 + u32 mlx5_flow_table_id(struct mlx5_flow_table *ft); 319 319 #endif
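The new @uid member and the exported mlx5_flow_table_id() accessor let an API consumer (such as the mlx5 RDMA driver) create a flow table on behalf of a user context and report the resulting table id back to it. Below is a hedged consumer-side sketch; the demo_* name, the BYPASS namespace and the single-entry sizing are arbitrary choices for illustration, not taken from this series.

#include <linux/err.h>
#include <linux/mlx5/driver.h>
#include <linux/mlx5/fs.h>

/* Create a table for user context @uid and return its id via @table_id. */
static struct mlx5_flow_table *demo_create_user_table(struct mlx5_core_dev *mdev,
						       u16 uid, u32 *table_id)
{
	struct mlx5_flow_table_attr ft_attr = {};
	struct mlx5_flow_namespace *ns;
	struct mlx5_flow_table *ft;

	ns = mlx5_get_flow_namespace(mdev, MLX5_FLOW_NAMESPACE_BYPASS);
	if (!ns)
		return ERR_PTR(-EOPNOTSUPP);

	ft_attr.max_fte = 1;	/* a single catch-all entry, for illustration */
	ft_attr.level = 0;
	ft_attr.uid = uid;	/* new in this series: the owning user context */

	ft = mlx5_create_flow_table(ns, &ft_attr);
	if (!IS_ERR(ft))
		*table_id = mlx5_flow_table_id(ft);	/* newly exported helper */
	return ft;
}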
+4 -2
include/linux/mlx5/mlx5_ifc.h
··· 1371 1371 }; 1372 1372 1373 1373 struct mlx5_ifc_cmd_hca_cap_bits { 1374 - u8 reserved_at_0[0x1f]; 1374 + u8 reserved_at_0[0x10]; 1375 + u8 shared_object_to_user_object_allowed[0x1]; 1376 + u8 reserved_at_13[0xe]; 1375 1377 u8 vhca_resource_manager[0x1]; 1376 1378 1377 1379 u8 hca_cap_2[0x1]; ··· 8530 8528 8531 8529 struct mlx5_ifc_create_flow_table_in_bits { 8532 8530 u8 opcode[0x10]; 8533 - u8 reserved_at_10[0x10]; 8531 + u8 uid[0x10]; 8534 8532 8535 8533 u8 reserved_at_20[0x10]; 8536 8534 u8 op_mod[0x10];
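The new shared_object_to_user_object_allowed HCA capability bit can be tested with the usual MLX5_CAP_GEN() helper before relying on the feature. A minimal, hypothetical gate (the function name is invented):

#include <linux/mlx5/device.h>
#include <linux/mlx5/driver.h>

/* Hypothetical feature gate keyed off the new capability bit. */
static bool demo_shared_objects_supported(struct mlx5_core_dev *mdev)
{
	return MLX5_CAP_GEN(mdev, shared_object_to_user_object_allowed);
}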
+1 -1
include/rdma/ib_verbs.h
··· 4603 4603 4604 4604 /** 4605 4605 * ib_lid_cpu16 - Return lid in 16bit CPU encoding. 4606 - * In the current implementation the only way to get 4606 + * In the current implementation the only way to 4607 4607 * get the 32bit lid is from other sources for OPA. 4608 4608 * For IB, lids will always be 16bits so cast the 4609 4609 * value accordingly.
+1
include/rdma/rdma_cm.h
··· 108 108 enum rdma_ucm_port_space ps; 109 109 enum ib_qp_type qp_type; 110 110 u32 port_num; 111 + struct work_struct net_work; 111 112 }; 112 113 113 114 struct rdma_cm_id *
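The net_work member added to struct rdma_cm_id gives the CM core a deferred-work hook so networking events, such as neighbour updates affecting control packets, can be handled outside of notifier context. The sketch below shows the general kernel pattern of pairing a work_struct with a netevent notifier; it is illustrative only, the demo_* names are invented, and the real handling lives inside the CMA core rather than in a standalone module.

#include <linux/module.h>
#include <linux/notifier.h>
#include <linux/workqueue.h>
#include <net/netevent.h>

static struct work_struct demo_net_work;	/* the real code embeds this per-ID */

static void demo_net_work_handler(struct work_struct *work)
{
	/* Re-resolve the affected route / refresh cached neighbour state here. */
}

static int demo_netevent_cb(struct notifier_block *nb, unsigned long event,
			    void *ctx)
{
	if (event == NETEVENT_NEIGH_UPDATE)
		queue_work(system_unbound_wq, &demo_net_work);
	return NOTIFY_DONE;
}

static struct notifier_block demo_netevent_nb = {
	.notifier_call = demo_netevent_cb,
};

static int __init demo_init(void)
{
	INIT_WORK(&demo_net_work, demo_net_work_handler);
	return register_netevent_notifier(&demo_netevent_nb);
}

static void __exit demo_exit(void)
{
	unregister_netevent_notifier(&demo_netevent_nb);
	cancel_work_sync(&demo_net_work);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");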
+49
include/uapi/rdma/erdma-abi.h
··· 1 + /* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */ 2 + /* 3 + * Copyright (c) 2020-2022, Alibaba Group. 4 + */ 5 + 6 + #ifndef __ERDMA_USER_H__ 7 + #define __ERDMA_USER_H__ 8 + 9 + #include <linux/types.h> 10 + 11 + #define ERDMA_ABI_VERSION 1 12 + 13 + struct erdma_ureq_create_cq { 14 + __aligned_u64 db_record_va; 15 + __aligned_u64 qbuf_va; 16 + __u32 qbuf_len; 17 + __u32 rsvd0; 18 + }; 19 + 20 + struct erdma_uresp_create_cq { 21 + __u32 cq_id; 22 + __u32 num_cqe; 23 + }; 24 + 25 + struct erdma_ureq_create_qp { 26 + __aligned_u64 db_record_va; 27 + __aligned_u64 qbuf_va; 28 + __u32 qbuf_len; 29 + __u32 rsvd0; 30 + }; 31 + 32 + struct erdma_uresp_create_qp { 33 + __u32 qp_id; 34 + __u32 num_sqe; 35 + __u32 num_rqe; 36 + __u32 rq_offset; 37 + }; 38 + 39 + struct erdma_uresp_alloc_ctx { 40 + __u32 dev_id; 41 + __u32 pad; 42 + __u32 sdb_type; 43 + __u32 sdb_offset; 44 + __aligned_u64 sdb; 45 + __aligned_u64 rdb; 46 + __aligned_u64 cdb; 47 + }; 48 + 49 + #endif
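These structures form the driver-private command and response payloads exchanged between the user-space erdma provider and the kernel driver. As a hedged illustration, a user-space library might fill the create-QP request as below; the helper name and the assumption that the queue buffer and doorbell record were already allocated by the library are mine, not part of the ABI.

#include <stdint.h>
#include <string.h>
#include <rdma/erdma-abi.h>	/* installed copy of include/uapi/rdma/erdma-abi.h */

/*
 * Hypothetical helper for a user-space provider: describe the queue buffer
 * and doorbell record the library mapped for a new QP.
 */
static void demo_fill_create_qp_req(struct erdma_ureq_create_qp *req,
				    void *qbuf, uint32_t qbuf_len,
				    void *db_record)
{
	memset(req, 0, sizeof(*req));
	req->qbuf_va = (uintptr_t)qbuf;		/* virtual address, __aligned_u64 */
	req->qbuf_len = qbuf_len;
	req->db_record_va = (uintptr_t)db_record;
	/* req->rsvd0 stays zero */
}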
+1
include/uapi/rdma/ib_user_ioctl_verbs.h
··· 250 250 RDMA_DRIVER_QIB, 251 251 RDMA_DRIVER_EFA, 252 252 RDMA_DRIVER_SIW, 253 + RDMA_DRIVER_ERDMA, 253 254 }; 254 255 255 256 enum ib_uverbs_gid_type {
+17
include/uapi/rdma/mlx5_user_ioctl_cmds.h
··· 228 228 MLX5_IB_OBJECT_VAR, 229 229 MLX5_IB_OBJECT_PP, 230 230 MLX5_IB_OBJECT_UAR, 231 + MLX5_IB_OBJECT_STEERING_ANCHOR, 231 232 }; 232 233 233 234 enum mlx5_ib_flow_matcher_create_attrs { ··· 247 246 enum mlx5_ib_flow_matcher_methods { 248 247 MLX5_IB_METHOD_FLOW_MATCHER_CREATE = (1U << UVERBS_ID_NS_SHIFT), 249 248 MLX5_IB_METHOD_FLOW_MATCHER_DESTROY, 249 + }; 250 + 251 + enum mlx5_ib_flow_steering_anchor_create_attrs { 252 + MLX5_IB_ATTR_STEERING_ANCHOR_CREATE_HANDLE = (1U << UVERBS_ID_NS_SHIFT), 253 + MLX5_IB_ATTR_STEERING_ANCHOR_FT_TYPE, 254 + MLX5_IB_ATTR_STEERING_ANCHOR_PRIORITY, 255 + MLX5_IB_ATTR_STEERING_ANCHOR_FT_ID, 256 + }; 257 + 258 + enum mlx5_ib_flow_steering_anchor_destroy_attrs { 259 + MLX5_IB_ATTR_STEERING_ANCHOR_DESTROY_HANDLE = (1U << UVERBS_ID_NS_SHIFT), 260 + }; 261 + 262 + enum mlx5_ib_steering_anchor_methods { 263 + MLX5_IB_METHOD_STEERING_ANCHOR_CREATE = (1U << UVERBS_ID_NS_SHIFT), 264 + MLX5_IB_METHOD_STEERING_ANCHOR_DESTROY, 250 265 }; 251 266 252 267 enum mlx5_ib_device_query_context_attrs {
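These enums only define the ioctl-level ABI for steering anchors; the driver side still has to describe the object and its methods with the uverbs ioctl macros. The fragment below is one plausible shape for the create method, written against the macros in <rdma/uverbs_ioctl.h>; the payload types (a u16 priority in, a u32 flow table id out) are inferred from the attribute names and are assumptions, not taken from this merge.

#include <rdma/uverbs_ioctl.h>
#include <rdma/mlx5_user_ioctl_cmds.h>
#include <rdma/mlx5_user_ioctl_verbs.h>

/* Sketch only: a possible uverbs description of the CREATE method. */
DECLARE_UVERBS_NAMED_METHOD(
	MLX5_IB_METHOD_STEERING_ANCHOR_CREATE,
	UVERBS_ATTR_IDR(MLX5_IB_ATTR_STEERING_ANCHOR_CREATE_HANDLE,
			MLX5_IB_OBJECT_STEERING_ANCHOR,
			UVERBS_ACCESS_NEW,
			UA_MANDATORY),
	UVERBS_ATTR_CONST_IN(MLX5_IB_ATTR_STEERING_ANCHOR_FT_TYPE,
			     enum mlx5_ib_uapi_flow_table_type,
			     UA_MANDATORY),
	UVERBS_ATTR_PTR_IN(MLX5_IB_ATTR_STEERING_ANCHOR_PRIORITY,
			   UVERBS_ATTR_TYPE(u16),
			   UA_MANDATORY),
	UVERBS_ATTR_PTR_OUT(MLX5_IB_ATTR_STEERING_ANCHOR_FT_ID,
			    UVERBS_ATTR_TYPE(u32),
			    UA_MANDATORY));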