Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

rds: add type of service (tos) infrastructure

The RDS type of service (TOS) is user-defined and must be configured
via the RDS ioctl interface. It must be set before initiating any
traffic, and once set, the TOS cannot be changed. All outgoing
traffic from the socket will be associated with its TOS.

Reviewed-by: Sowmini Varadhan <sowmini.varadhan@oracle.com>
Signed-off-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>
[yanjun.zhu@oracle.com: Adapted original patch with ipv6 changes]
Signed-off-by: Zhu Yanjun <yanjun.zhu@oracle.com>

+72 -17
+11
include/uapi/linux/rds.h
··· 69 69 #define RDS_TRANS_COUNT 3 70 70 #define RDS_TRANS_NONE (~0) 71 71 72 + /* IOCTLS commands for SOL_RDS */ 73 + #define SIOCRDSSETTOS (SIOCPROTOPRIVATE) 74 + #define SIOCRDSGETTOS (SIOCPROTOPRIVATE + 1) 75 + 76 + typedef __u8 rds_tos_t; 77 + 72 78 /* 73 79 * Control message types for SOL_RDS. 74 80 * ··· 155 149 __be32 faddr; 156 150 __u8 transport[TRANSNAMSIZ]; /* null term ascii */ 157 151 __u8 flags; 152 + __u8 tos; 158 153 } __attribute__((packed)); 159 154 160 155 struct rds6_info_connection { ··· 178 171 __be16 lport; 179 172 __be16 fport; 180 173 __u8 flags; 174 + __u8 tos; 181 175 } __attribute__((packed)); 182 176 183 177 struct rds6_info_message { ··· 222 214 __u32 last_sent_nxt; 223 215 __u32 last_expected_una; 224 216 __u32 last_seen_una; 217 + __u8 tos; 225 218 } __attribute__((packed)); 226 219 227 220 struct rds6_info_tcp_socket { ··· 249 240 __u32 max_send_sge; 250 241 __u32 rdma_mr_max; 251 242 __u32 rdma_mr_size; 243 + __u8 tos; 252 244 }; 253 245 254 246 struct rds6_info_rdma_connection { ··· 263 253 __u32 max_send_sge; 264 254 __u32 rdma_mr_max; 265 255 __u32 rdma_mr_size; 256 + __u8 tos; 266 257 }; 267 258 268 259 /* RDS message Receive Path Latency points */
+34 -1
net/rds/af_rds.c
··· 254 254 255 255 static int rds_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) 256 256 { 257 - return -ENOIOCTLCMD; 257 + struct rds_sock *rs = rds_sk_to_rs(sock->sk); 258 + rds_tos_t tos; 259 + 260 + switch (cmd) { 261 + case SIOCRDSSETTOS: 262 + if (get_user(tos, (rds_tos_t __user *)arg)) 263 + return -EFAULT; 264 + 265 + if (rs->rs_transport && 266 + rs->rs_transport->t_type == RDS_TRANS_TCP) 267 + tos = 0; 268 + 269 + spin_lock_bh(&rds_sock_lock); 270 + if (rs->rs_tos || rs->rs_conn) { 271 + spin_unlock_bh(&rds_sock_lock); 272 + return -EINVAL; 273 + } 274 + rs->rs_tos = tos; 275 + spin_unlock_bh(&rds_sock_lock); 276 + break; 277 + case SIOCRDSGETTOS: 278 + spin_lock_bh(&rds_sock_lock); 279 + tos = rs->rs_tos; 280 + spin_unlock_bh(&rds_sock_lock); 281 + if (put_user(tos, (rds_tos_t __user *)arg)) 282 + return -EFAULT; 283 + break; 284 + default: 285 + return -ENOIOCTLCMD; 286 + } 287 + 288 + return 0; 258 289 } 259 290 260 291 static int rds_cancel_sent_to(struct rds_sock *rs, char __user *optval, ··· 681 650 spin_lock_init(&rs->rs_rdma_lock); 682 651 rs->rs_rdma_keys = RB_ROOT; 683 652 rs->rs_rx_traces = 0; 653 + rs->rs_tos = 0; 654 + rs->rs_conn = NULL; 684 655 685 656 spin_lock_bh(&rds_sock_lock); 686 657 list_add_tail(&rs->rs_item, &rds_sock_list);
+11 -9
net/rds/connection.c
··· 84 84 const struct in6_addr *laddr, 85 85 const struct in6_addr *faddr, 86 86 struct rds_transport *trans, 87 - int dev_if) 87 + u8 tos, int dev_if) 88 88 { 89 89 struct rds_connection *conn, *ret = NULL; 90 90 ··· 92 92 if (ipv6_addr_equal(&conn->c_faddr, faddr) && 93 93 ipv6_addr_equal(&conn->c_laddr, laddr) && 94 94 conn->c_trans == trans && 95 + conn->c_tos == tos && 95 96 net == rds_conn_net(conn) && 96 97 conn->c_dev_if == dev_if) { 97 98 ret = conn; ··· 161 160 const struct in6_addr *laddr, 162 161 const struct in6_addr *faddr, 163 162 struct rds_transport *trans, 164 - gfp_t gfp, 163 + gfp_t gfp, u8 tos, 165 164 int is_outgoing, 166 165 int dev_if) 167 166 { ··· 173 172 int npaths = (trans->t_mp_capable ? RDS_MPATH_WORKERS : 1); 174 173 175 174 rcu_read_lock(); 176 - conn = rds_conn_lookup(net, head, laddr, faddr, trans, dev_if); 175 + conn = rds_conn_lookup(net, head, laddr, faddr, trans, tos, dev_if); 177 176 if (conn && 178 177 conn->c_loopback && 179 178 conn->c_trans != &rds_loop_transport && ··· 207 206 conn->c_isv6 = !ipv6_addr_v4mapped(laddr); 208 207 conn->c_faddr = *faddr; 209 208 conn->c_dev_if = dev_if; 209 + conn->c_tos = tos; 210 210 211 211 #if IS_ENABLED(CONFIG_IPV6) 212 212 /* If the local address is link local, set c_bound_if to be the ··· 300 298 struct rds_connection *found; 301 299 302 300 found = rds_conn_lookup(net, head, laddr, faddr, trans, 303 - dev_if); 301 + tos, dev_if); 304 302 if (found) { 305 303 struct rds_conn_path *cp; 306 304 int i; ··· 335 333 struct rds_connection *rds_conn_create(struct net *net, 336 334 const struct in6_addr *laddr, 337 335 const struct in6_addr *faddr, 338 - struct rds_transport *trans, gfp_t gfp, 339 - int dev_if) 336 + struct rds_transport *trans, u8 tos, 337 + gfp_t gfp, int dev_if) 340 338 { 341 - return __rds_conn_create(net, laddr, faddr, trans, gfp, 0, dev_if); 339 + return __rds_conn_create(net, laddr, faddr, trans, gfp, tos, 0, dev_if); 342 340 } 343 341 
EXPORT_SYMBOL_GPL(rds_conn_create); 344 342 ··· 346 344 const struct in6_addr *laddr, 347 345 const struct in6_addr *faddr, 348 346 struct rds_transport *trans, 349 - gfp_t gfp, int dev_if) 347 + u8 tos, gfp_t gfp, int dev_if) 350 348 { 351 - return __rds_conn_create(net, laddr, faddr, trans, gfp, 1, dev_if); 349 + return __rds_conn_create(net, laddr, faddr, trans, gfp, tos, 1, dev_if); 352 350 } 353 351 EXPORT_SYMBOL_GPL(rds_conn_create_outgoing); 354 352
+1
net/rds/ib.c
··· 301 301 302 302 iinfo->src_addr = conn->c_laddr.s6_addr32[3]; 303 303 iinfo->dst_addr = conn->c_faddr.s6_addr32[3]; 304 + iinfo->tos = conn->c_tos; 304 305 305 306 memset(&iinfo->src_gid, 0, sizeof(iinfo->src_gid)); 306 307 memset(&iinfo->dst_gid, 0, sizeof(iinfo->dst_gid));
+1 -1
net/rds/ib_cm.c
··· 786 786 787 787 /* RDS/IB is not currently netns aware, thus init_net */ 788 788 conn = rds_conn_create(&init_net, daddr6, saddr6, 789 - &rds_ib_transport, GFP_KERNEL, ifindex); 789 + &rds_ib_transport, 0, GFP_KERNEL, ifindex); 790 790 if (IS_ERR(conn)) { 791 791 rdsdebug("rds_conn_create failed (%ld)\n", PTR_ERR(conn)); 792 792 conn = NULL;
+1
net/rds/rdma_transport.c
··· 115 115 pr_warn("RDS/RDMA: conn <%pI6c, %pI6c> rejected, dropping connection\n", 116 116 &conn->c_laddr, &conn->c_faddr); 117 117 conn->c_proposed_version = RDS_PROTOCOL_COMPAT_VERSION; 118 + conn->c_tos = 0; 118 119 rds_conn_drop(conn); 119 120 } 120 121 rdsdebug("Connection rejected: %s\n",
+7 -2
net/rds/rds.h
··· 158 158 unsigned int c_version; 159 159 possible_net_t c_net; 160 160 161 + /* TOS */ 162 + u8 c_tos; 163 + 161 164 struct list_head c_map_item; 162 165 unsigned long c_map_queued; 163 166 ··· 655 652 u8 rs_rx_traces; 656 653 u8 rs_rx_trace[RDS_MSG_RX_DGRAM_TRACE_MAX]; 657 654 struct rds_msg_zcopy_queue rs_zcookie_queue; 655 + u8 rs_tos; 658 656 }; 659 657 660 658 static inline struct rds_sock *rds_sk_to_rs(const struct sock *sk) ··· 764 760 struct rds_connection *rds_conn_create(struct net *net, 765 761 const struct in6_addr *laddr, 766 762 const struct in6_addr *faddr, 767 - struct rds_transport *trans, gfp_t gfp, 763 + struct rds_transport *trans, 764 + u8 tos, gfp_t gfp, 768 765 int dev_if); 769 766 struct rds_connection *rds_conn_create_outgoing(struct net *net, 770 767 const struct in6_addr *laddr, 771 768 const struct in6_addr *faddr, 772 769 struct rds_transport *trans, 773 - gfp_t gfp, int dev_if); 770 + u8 tos, gfp_t gfp, int dev_if); 774 771 void rds_conn_shutdown(struct rds_conn_path *cpath); 775 772 void rds_conn_destroy(struct rds_connection *conn); 776 773 void rds_conn_drop(struct rds_connection *conn);
+1
net/rds/recv.c
··· 782 782 783 783 minfo.seq = be64_to_cpu(inc->i_hdr.h_sequence); 784 784 minfo.len = be32_to_cpu(inc->i_hdr.h_len); 785 + minfo.tos = inc->i_conn->c_tos; 785 786 786 787 if (flip) { 787 788 minfo.laddr = daddr;
+3 -3
net/rds/send.c
··· 1277 1277 1278 1278 /* rds_conn_create has a spinlock that runs with IRQ off. 1279 1279 * Caching the conn in the socket helps a lot. */ 1280 - if (rs->rs_conn && ipv6_addr_equal(&rs->rs_conn->c_faddr, &daddr)) 1280 + if (rs->rs_conn && ipv6_addr_equal(&rs->rs_conn->c_faddr, &daddr)) { 1281 1281 conn = rs->rs_conn; 1282 - else { 1282 + } else { 1283 1283 conn = rds_conn_create_outgoing(sock_net(sock->sk), 1284 1284 &rs->rs_bound_addr, &daddr, 1285 - rs->rs_transport, 1285 + rs->rs_transport, 0, 1286 1286 sock->sk->sk_allocation, 1287 1287 scope_id); 1288 1288 if (IS_ERR(conn)) {
+1
net/rds/tcp.c
··· 267 267 tsinfo.last_sent_nxt = tc->t_last_sent_nxt; 268 268 tsinfo.last_expected_una = tc->t_last_expected_una; 269 269 tsinfo.last_seen_una = tc->t_last_seen_una; 270 + tsinfo.tos = tc->t_cpath->cp_conn->c_tos; 270 271 271 272 rds_info_copy(iter, &tsinfo, sizeof(tsinfo)); 272 273 }
+1 -1
net/rds/tcp_listen.c
··· 200 200 201 201 conn = rds_conn_create(sock_net(sock->sk), 202 202 my_addr, peer_addr, 203 - &rds_tcp_transport, GFP_KERNEL, dev_if); 203 + &rds_tcp_transport, 0, GFP_KERNEL, dev_if); 204 204 205 205 if (IS_ERR(conn)) { 206 206 ret = PTR_ERR(conn);