Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

net: add network priority cgroup infrastructure (v4)

This patch adds the infrastructure code to create the network priority
cgroup. In addition to the standard processes file, the cgroup creates two
control files:

1) prioidx - This is a read-only file that exports the index of this cgroup.
The index is arbitrary, is unique to a cgroup within this subsystem, and is
used to index the per-device priority map.

2) priomap - This is a writeable file. On read it reports a table of 2-tuples
<name:priority> where name is the name of a network interface and priority
indicates the priority assigned to frames egressing on the named interface and
originating from a pid in this cgroup.

This cgroup allows for skb priority to be set prior to a root qdisc getting
selected. This is beneficial for DCB-enabled systems, in that it allows any
application to use DCB-configured priorities without application modification.

Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
CC: Robert Love <robert.w.love@intel.com>
CC: "David S. Miller" <davem@davemloft.net>
Signed-off-by: David S. Miller <davem@davemloft.net>

authored by

Neil Horman and committed by
David S. Miller
5bc1421e 202ff1c2

+469 -1
+8
include/linux/cgroup_subsys.h
··· 59 59 SUBSYS(blkio) 60 60 #endif 61 61 62 + /* */ 63 + 62 64 #ifdef CONFIG_CGROUP_PERF 63 65 SUBSYS(perf) 66 + #endif 67 + 68 + /* */ 69 + 70 + #ifdef CONFIG_NETPRIO_CGROUP 71 + SUBSYS(net_prio) 64 72 #endif 65 73 66 74 /* */
+4
include/linux/netdevice.h
··· 50 50 #ifdef CONFIG_DCB 51 51 #include <net/dcbnl.h> 52 52 #endif 53 + #include <net/netprio_cgroup.h> 53 54 54 55 #include <linux/netdev_features.h> 55 56 ··· 1245 1244 #if defined(CONFIG_FCOE) || defined(CONFIG_FCOE_MODULE) 1246 1245 /* max exchange id for FCoE LRO by ddp */ 1247 1246 unsigned int fcoe_ddp_xid; 1247 + #endif 1248 + #if IS_ENABLED(CONFIG_NETPRIO_CGROUP) 1249 + struct netprio_map __rcu *priomap; 1248 1250 #endif 1249 1251 /* phy device may attach itself for hardware timestamping */ 1250 1252 struct phy_device *phydev;
+65
include/net/netprio_cgroup.h
··· 1 + /* 2 + * netprio_cgroup.h Control Group Priority set 3 + * 4 + * 5 + * Authors: Neil Horman <nhorman@tuxdriver.com> 6 + * 7 + * This program is free software; you can redistribute it and/or modify it 8 + * under the terms of the GNU General Public License as published by the Free 9 + * Software Foundation; either version 2 of the License, or (at your option) 10 + * any later version. 11 + * 12 + */ 13 + 14 + #ifndef _NETPRIO_CGROUP_H 15 + #define _NETPRIO_CGROUP_H 16 + #include <linux/module.h> 17 + #include <linux/cgroup.h> 18 + #include <linux/hardirq.h> 19 + #include <linux/rcupdate.h> 20 + 21 + struct cgroup_netprio_state 22 + { 23 + struct cgroup_subsys_state css; 24 + u32 prioidx; 25 + }; 26 + 27 + struct netprio_map { 28 + struct rcu_head rcu; 29 + u32 priomap_len; 30 + u32 priomap[]; 31 + }; 32 + 33 + #ifdef CONFIG_CGROUPS 34 + 35 + #ifndef CONFIG_NETPRIO_CGROUP 36 + extern int net_prio_subsys_id; 37 + #endif 38 + 39 + extern void sock_update_netprioidx(struct sock *sk); 40 + 41 + static inline struct cgroup_netprio_state 42 + *task_netprio_state(struct task_struct *p) 43 + { 44 + #if IS_ENABLED(CONFIG_NETPRIO_CGROUP) 45 + return container_of(task_subsys_state(p, net_prio_subsys_id), 46 + struct cgroup_netprio_state, css); 47 + #else 48 + return NULL; 49 + #endif 50 + } 51 + 52 + #else 53 + 54 + #define sock_update_netprioidx(sk) 55 + #define skb_update_prio(skb) 56 + 57 + static inline struct cgroup_netprio_state 58 + *task_netprio_state(struct task_struct *p) 59 + { 60 + return NULL; 61 + } 62 + 63 + #endif 64 + 65 + #endif /* _NET_CLS_CGROUP_H */
+3
include/net/sock.h
··· 320 320 unsigned short sk_ack_backlog; 321 321 unsigned short sk_max_ack_backlog; 322 322 __u32 sk_priority; 323 + #ifdef CONFIG_CGROUPS 324 + __u32 sk_cgrp_prioidx; 325 + #endif 323 326 struct pid *sk_peer_pid; 324 327 const struct cred *sk_peer_cred; 325 328 long sk_rcvtimeo;
+7
net/Kconfig
··· 232 232 depends on SMP && SYSFS && USE_GENERIC_SMP_HELPERS 233 233 default y 234 234 235 + config NETPRIO_CGROUP 236 + tristate "Network priority cgroup" 237 + depends on CGROUPS 238 + ---help--- 239 + Cgroup subsystem for use in assigning processes to network priorities on 240 + a per-interface basis 241 + 235 242 config HAVE_BPF_JIT 236 243 bool 237 244
+1
net/core/Makefile
··· 19 19 obj-$(CONFIG_TRACEPOINTS) += net-traces.o 20 20 obj-$(CONFIG_NET_DROP_MONITOR) += drop_monitor.o 21 21 obj-$(CONFIG_NETWORK_PHY_TIMESTAMPING) += timestamping.o 22 + obj-$(CONFIG_NETPRIO_CGROUP) += netprio_cgroup.o
+14
net/core/dev.c
··· 2449 2449 return rc; 2450 2450 } 2451 2451 2452 + #if IS_ENABLED(CONFIG_NETPRIO_CGROUP) 2453 + static void skb_update_prio(struct sk_buff *skb) 2454 + { 2455 + struct netprio_map *map = rcu_dereference(skb->dev->priomap); 2456 + 2457 + if ((!skb->priority) && (skb->sk) && map) 2458 + skb->priority = map->priomap[skb->sk->sk_cgrp_prioidx]; 2459 + } 2460 + #else 2461 + #define skb_update_prio(skb) 2462 + #endif 2463 + 2452 2464 static DEFINE_PER_CPU(int, xmit_recursion); 2453 2465 #define RECURSION_LIMIT 10 2454 2466 ··· 2500 2488 * stops preemption for RCU. 2501 2489 */ 2502 2490 rcu_read_lock_bh(); 2491 + 2492 + skb_update_prio(skb); 2503 2493 2504 2494 txq = dev_pick_tx(dev, skb); 2505 2495 q = rcu_dereference_bh(txq->qdisc);
+344
net/core/netprio_cgroup.c
··· 1 + /* 2 + * net/core/netprio_cgroup.c Priority Control Group 3 + * 4 + * This program is free software; you can redistribute it and/or 5 + * modify it under the terms of the GNU General Public License 6 + * as published by the Free Software Foundation; either version 7 + * 2 of the License, or (at your option) any later version. 8 + * 9 + * Authors: Neil Horman <nhorman@tuxdriver.com> 10 + */ 11 + 12 + #include <linux/module.h> 13 + #include <linux/slab.h> 14 + #include <linux/types.h> 15 + #include <linux/string.h> 16 + #include <linux/errno.h> 17 + #include <linux/skbuff.h> 18 + #include <linux/cgroup.h> 19 + #include <linux/rcupdate.h> 20 + #include <linux/atomic.h> 21 + #include <net/rtnetlink.h> 22 + #include <net/pkt_cls.h> 23 + #include <net/sock.h> 24 + #include <net/netprio_cgroup.h> 25 + 26 + static struct cgroup_subsys_state *cgrp_create(struct cgroup_subsys *ss, 27 + struct cgroup *cgrp); 28 + static void cgrp_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp); 29 + static int cgrp_populate(struct cgroup_subsys *ss, struct cgroup *cgrp); 30 + 31 + struct cgroup_subsys net_prio_subsys = { 32 + .name = "net_prio", 33 + .create = cgrp_create, 34 + .destroy = cgrp_destroy, 35 + .populate = cgrp_populate, 36 + #ifdef CONFIG_NETPRIO_CGROUP 37 + .subsys_id = net_prio_subsys_id, 38 + #endif 39 + .module = THIS_MODULE 40 + }; 41 + 42 + #define PRIOIDX_SZ 128 43 + 44 + static unsigned long prioidx_map[PRIOIDX_SZ]; 45 + static DEFINE_SPINLOCK(prioidx_map_lock); 46 + static atomic_t max_prioidx = ATOMIC_INIT(0); 47 + 48 + static inline struct cgroup_netprio_state *cgrp_netprio_state(struct cgroup *cgrp) 49 + { 50 + return container_of(cgroup_subsys_state(cgrp, net_prio_subsys_id), 51 + struct cgroup_netprio_state, css); 52 + } 53 + 54 + static int get_prioidx(u32 *prio) 55 + { 56 + unsigned long flags; 57 + u32 prioidx; 58 + 59 + spin_lock_irqsave(&prioidx_map_lock, flags); 60 + prioidx = find_first_zero_bit(prioidx_map, sizeof(unsigned long) * 
PRIOIDX_SZ); 61 + set_bit(prioidx, prioidx_map); 62 + spin_unlock_irqrestore(&prioidx_map_lock, flags); 63 + if (prioidx == sizeof(unsigned long) * PRIOIDX_SZ) 64 + return -ENOSPC; 65 + 66 + atomic_set(&max_prioidx, prioidx); 67 + *prio = prioidx; 68 + return 0; 69 + } 70 + 71 + static void put_prioidx(u32 idx) 72 + { 73 + unsigned long flags; 74 + 75 + spin_lock_irqsave(&prioidx_map_lock, flags); 76 + clear_bit(idx, prioidx_map); 77 + spin_unlock_irqrestore(&prioidx_map_lock, flags); 78 + } 79 + 80 + static void extend_netdev_table(struct net_device *dev, u32 new_len) 81 + { 82 + size_t new_size = sizeof(struct netprio_map) + 83 + ((sizeof(u32) * new_len)); 84 + struct netprio_map *new_priomap = kzalloc(new_size, GFP_KERNEL); 85 + struct netprio_map *old_priomap; 86 + int i; 87 + 88 + old_priomap = rtnl_dereference(dev->priomap); 89 + 90 + if (!new_priomap) { 91 + printk(KERN_WARNING "Unable to alloc new priomap!\n"); 92 + return; 93 + } 94 + 95 + for (i = 0; 96 + old_priomap && (i < old_priomap->priomap_len); 97 + i++) 98 + new_priomap->priomap[i] = old_priomap->priomap[i]; 99 + 100 + new_priomap->priomap_len = new_len; 101 + 102 + rcu_assign_pointer(dev->priomap, new_priomap); 103 + if (old_priomap) 104 + kfree_rcu(old_priomap, rcu); 105 + } 106 + 107 + static void update_netdev_tables(void) 108 + { 109 + struct net_device *dev; 110 + u32 max_len = atomic_read(&max_prioidx); 111 + struct netprio_map *map; 112 + 113 + rtnl_lock(); 114 + for_each_netdev(&init_net, dev) { 115 + map = rtnl_dereference(dev->priomap); 116 + if ((!map) || 117 + (map->priomap_len < max_len)) 118 + extend_netdev_table(dev, max_len); 119 + } 120 + rtnl_unlock(); 121 + } 122 + 123 + static struct cgroup_subsys_state *cgrp_create(struct cgroup_subsys *ss, 124 + struct cgroup *cgrp) 125 + { 126 + struct cgroup_netprio_state *cs; 127 + int ret; 128 + 129 + cs = kzalloc(sizeof(*cs), GFP_KERNEL); 130 + if (!cs) 131 + return ERR_PTR(-ENOMEM); 132 + 133 + if (cgrp->parent && 
cgrp_netprio_state(cgrp->parent)->prioidx) { 134 + kfree(cs); 135 + return ERR_PTR(-EINVAL); 136 + } 137 + 138 + ret = get_prioidx(&cs->prioidx); 139 + if (ret != 0) { 140 + printk(KERN_WARNING "No space in priority index array\n"); 141 + kfree(cs); 142 + return ERR_PTR(ret); 143 + } 144 + 145 + return &cs->css; 146 + } 147 + 148 + static void cgrp_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) 149 + { 150 + struct cgroup_netprio_state *cs; 151 + struct net_device *dev; 152 + struct netprio_map *map; 153 + 154 + cs = cgrp_netprio_state(cgrp); 155 + rtnl_lock(); 156 + for_each_netdev(&init_net, dev) { 157 + map = rtnl_dereference(dev->priomap); 158 + if (map) 159 + map->priomap[cs->prioidx] = 0; 160 + } 161 + rtnl_unlock(); 162 + put_prioidx(cs->prioidx); 163 + kfree(cs); 164 + } 165 + 166 + static u64 read_prioidx(struct cgroup *cgrp, struct cftype *cft) 167 + { 168 + return (u64)cgrp_netprio_state(cgrp)->prioidx; 169 + } 170 + 171 + static int read_priomap(struct cgroup *cont, struct cftype *cft, 172 + struct cgroup_map_cb *cb) 173 + { 174 + struct net_device *dev; 175 + u32 prioidx = cgrp_netprio_state(cont)->prioidx; 176 + u32 priority; 177 + struct netprio_map *map; 178 + 179 + rcu_read_lock(); 180 + for_each_netdev_rcu(&init_net, dev) { 181 + map = rcu_dereference(dev->priomap); 182 + priority = map ? 
map->priomap[prioidx] : 0; 183 + cb->fill(cb, dev->name, priority); 184 + } 185 + rcu_read_unlock(); 186 + return 0; 187 + } 188 + 189 + static int write_priomap(struct cgroup *cgrp, struct cftype *cft, 190 + const char *buffer) 191 + { 192 + char *devname = kstrdup(buffer, GFP_KERNEL); 193 + int ret = -EINVAL; 194 + u32 prioidx = cgrp_netprio_state(cgrp)->prioidx; 195 + unsigned long priority; 196 + char *priostr; 197 + struct net_device *dev; 198 + struct netprio_map *map; 199 + 200 + if (!devname) 201 + return -ENOMEM; 202 + 203 + /* 204 + * Minimally sized valid priomap string 205 + */ 206 + if (strlen(devname) < 3) 207 + goto out_free_devname; 208 + 209 + priostr = strstr(devname, " "); 210 + if (!priostr) 211 + goto out_free_devname; 212 + 213 + /* 214 + *Separate the devname from the associated priority 215 + *and advance the priostr poitner to the priority value 216 + */ 217 + *priostr = '\0'; 218 + priostr++; 219 + 220 + /* 221 + * If the priostr points to NULL, we're at the end of the passed 222 + * in string, and its not a valid write 223 + */ 224 + if (*priostr == '\0') 225 + goto out_free_devname; 226 + 227 + ret = kstrtoul(priostr, 10, &priority); 228 + if (ret < 0) 229 + goto out_free_devname; 230 + 231 + ret = -ENODEV; 232 + 233 + dev = dev_get_by_name(&init_net, devname); 234 + if (!dev) 235 + goto out_free_devname; 236 + 237 + update_netdev_tables(); 238 + ret = 0; 239 + rcu_read_lock(); 240 + map = rcu_dereference(dev->priomap); 241 + if (map) 242 + map->priomap[prioidx] = priority; 243 + rcu_read_unlock(); 244 + dev_put(dev); 245 + 246 + out_free_devname: 247 + kfree(devname); 248 + return ret; 249 + } 250 + 251 + static struct cftype ss_files[] = { 252 + { 253 + .name = "prioidx", 254 + .read_u64 = read_prioidx, 255 + }, 256 + { 257 + .name = "ifpriomap", 258 + .read_map = read_priomap, 259 + .write_string = write_priomap, 260 + }, 261 + }; 262 + 263 + static int cgrp_populate(struct cgroup_subsys *ss, struct cgroup *cgrp) 264 + { 265 + return 
cgroup_add_files(cgrp, ss, ss_files, ARRAY_SIZE(ss_files)); 266 + } 267 + 268 + static int netprio_device_event(struct notifier_block *unused, 269 + unsigned long event, void *ptr) 270 + { 271 + struct net_device *dev = ptr; 272 + struct netprio_map *old; 273 + u32 max_len = atomic_read(&max_prioidx); 274 + 275 + /* 276 + * Note this is called with rtnl_lock held so we have update side 277 + * protection on our rcu assignments 278 + */ 279 + 280 + switch (event) { 281 + 282 + case NETDEV_REGISTER: 283 + if (max_len) 284 + extend_netdev_table(dev, max_len); 285 + break; 286 + case NETDEV_UNREGISTER: 287 + old = rtnl_dereference(dev->priomap); 288 + rcu_assign_pointer(dev->priomap, NULL); 289 + if (old) 290 + kfree_rcu(old, rcu); 291 + break; 292 + } 293 + return NOTIFY_DONE; 294 + } 295 + 296 + static struct notifier_block netprio_device_notifier = { 297 + .notifier_call = netprio_device_event 298 + }; 299 + 300 + static int __init init_cgroup_netprio(void) 301 + { 302 + int ret; 303 + 304 + ret = cgroup_load_subsys(&net_prio_subsys); 305 + if (ret) 306 + goto out; 307 + #ifndef CONFIG_NETPRIO_CGROUP 308 + smp_wmb(); 309 + net_prio_subsys_id = net_prio_subsys.subsys_id; 310 + #endif 311 + 312 + register_netdevice_notifier(&netprio_device_notifier); 313 + 314 + out: 315 + return ret; 316 + } 317 + 318 + static void __exit exit_cgroup_netprio(void) 319 + { 320 + struct netprio_map *old; 321 + struct net_device *dev; 322 + 323 + unregister_netdevice_notifier(&netprio_device_notifier); 324 + 325 + cgroup_unload_subsys(&net_prio_subsys); 326 + 327 + #ifndef CONFIG_NETPRIO_CGROUP 328 + net_prio_subsys_id = -1; 329 + synchronize_rcu(); 330 + #endif 331 + 332 + rtnl_lock(); 333 + for_each_netdev(&init_net, dev) { 334 + old = rtnl_dereference(dev->priomap); 335 + rcu_assign_pointer(dev->priomap, NULL); 336 + if (old) 337 + kfree_rcu(old, rcu); 338 + } 339 + rtnl_unlock(); 340 + } 341 + 342 + module_init(init_cgroup_netprio); 343 + module_exit(exit_cgroup_netprio); 344 + 
MODULE_LICENSE("GPL v2");
+21 -1
net/core/sock.c
··· 125 125 #include <net/xfrm.h> 126 126 #include <linux/ipsec.h> 127 127 #include <net/cls_cgroup.h> 128 + #include <net/netprio_cgroup.h> 128 129 129 130 #include <linux/filter.h> 130 131 ··· 222 221 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512); 223 222 EXPORT_SYMBOL(sysctl_optmem_max); 224 223 225 - #if defined(CONFIG_CGROUPS) && !defined(CONFIG_NET_CLS_CGROUP) 224 + #if defined(CONFIG_CGROUPS) 225 + #if !defined(CONFIG_NET_CLS_CGROUP) 226 226 int net_cls_subsys_id = -1; 227 227 EXPORT_SYMBOL_GPL(net_cls_subsys_id); 228 + #endif 229 + #if !defined(CONFIG_NETPRIO_CGROUP) 230 + int net_prio_subsys_id = -1; 231 + EXPORT_SYMBOL_GPL(net_prio_subsys_id); 232 + #endif 228 233 #endif 229 234 230 235 static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen) ··· 1127 1120 sk->sk_classid = classid; 1128 1121 } 1129 1122 EXPORT_SYMBOL(sock_update_classid); 1123 + 1124 + void sock_update_netprioidx(struct sock *sk) 1125 + { 1126 + struct cgroup_netprio_state *state; 1127 + if (in_interrupt()) 1128 + return; 1129 + rcu_read_lock(); 1130 + state = task_netprio_state(current); 1131 + sk->sk_cgrp_prioidx = state ? state->prioidx : 0; 1132 + rcu_read_unlock(); 1133 + } 1134 + EXPORT_SYMBOL_GPL(sock_update_netprioidx); 1130 1135 #endif 1131 1136 1132 1137 /** ··· 1166 1147 atomic_set(&sk->sk_wmem_alloc, 1); 1167 1148 1168 1149 sock_update_classid(sk); 1150 + sock_update_netprioidx(sk); 1169 1151 } 1170 1152 1171 1153 return sk;
+2
net/socket.c
··· 551 551 552 552 sock_update_classid(sock->sk); 553 553 554 + sock_update_netprioidx(sock->sk); 555 + 554 556 si->sock = sock; 555 557 si->scm = NULL; 556 558 si->msg = msg;