Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

netlink: mmaped netlink: ring setup

Add support for mmap'ed RX and TX ring setup and teardown based on the
af_packet.c code. The following patches will use this to add the real
mmap'ed receive and transmit functionality.

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>

Authored by Patrick McHardy; committed by David S. Miller.
ccdfcc39 cf0a018a

+327 -2
+32
include/uapi/linux/netlink.h
··· 1 1 #ifndef _UAPI__LINUX_NETLINK_H 2 2 #define _UAPI__LINUX_NETLINK_H 3 3 4 + #include <linux/kernel.h> 4 5 #include <linux/socket.h> /* for __kernel_sa_family_t */ 5 6 #include <linux/types.h> 6 7 ··· 106 105 #define NETLINK_PKTINFO 3 107 106 #define NETLINK_BROADCAST_ERROR 4 108 107 #define NETLINK_NO_ENOBUFS 5 108 + #define NETLINK_RX_RING 6 109 + #define NETLINK_TX_RING 7 109 110 110 111 struct nl_pktinfo { 111 112 __u32 group; 112 113 }; 114 + 115 + struct nl_mmap_req { 116 + unsigned int nm_block_size; 117 + unsigned int nm_block_nr; 118 + unsigned int nm_frame_size; 119 + unsigned int nm_frame_nr; 120 + }; 121 + 122 + struct nl_mmap_hdr { 123 + unsigned int nm_status; 124 + unsigned int nm_len; 125 + __u32 nm_group; 126 + /* credentials */ 127 + __u32 nm_pid; 128 + __u32 nm_uid; 129 + __u32 nm_gid; 130 + }; 131 + 132 + enum nl_mmap_status { 133 + NL_MMAP_STATUS_UNUSED, 134 + NL_MMAP_STATUS_RESERVED, 135 + NL_MMAP_STATUS_VALID, 136 + NL_MMAP_STATUS_COPY, 137 + NL_MMAP_STATUS_SKIP, 138 + }; 139 + 140 + #define NL_MMAP_MSG_ALIGNMENT NLMSG_ALIGNTO 141 + #define NL_MMAP_MSG_ALIGN(sz) __ALIGN_KERNEL(sz, NL_MMAP_MSG_ALIGNMENT) 142 + #define NL_MMAP_HDRLEN NL_MMAP_MSG_ALIGN(sizeof(struct nl_mmap_hdr)) 113 143 114 144 #define NET_MAJOR 36 /* Major 36 is reserved for networking */ 115 145
+9
net/Kconfig
··· 23 23 24 24 if NET 25 25 26 + config NETLINK_MMAP 27 + bool "Netlink: mmaped IO" 28 + help 29 + This option enables support for memory mapped netlink IO. This 30 + reduces overhead by avoiding copying data between kernel- and 31 + userspace. 32 + 33 + If unsure, say N. 34 + 26 35 config WANT_COMPAT_NETLINK_MESSAGES 27 36 bool 28 37 help
+266 -2
net/netlink/af_netlink.c
··· 55 55 #include <linux/types.h> 56 56 #include <linux/audit.h> 57 57 #include <linux/mutex.h> 58 + #include <linux/vmalloc.h> 58 59 59 60 #include <net/net_namespace.h> 60 61 #include <net/sock.h> ··· 108 107 return &hash->table[jhash_1word(portid, hash->rnd) & hash->mask]; 109 108 } 110 109 110 + #ifdef CONFIG_NETLINK_MMAP 111 + static __pure struct page *pgvec_to_page(const void *addr) 112 + { 113 + if (is_vmalloc_addr(addr)) 114 + return vmalloc_to_page(addr); 115 + else 116 + return virt_to_page(addr); 117 + } 118 + 119 + static void free_pg_vec(void **pg_vec, unsigned int order, unsigned int len) 120 + { 121 + unsigned int i; 122 + 123 + for (i = 0; i < len; i++) { 124 + if (pg_vec[i] != NULL) { 125 + if (is_vmalloc_addr(pg_vec[i])) 126 + vfree(pg_vec[i]); 127 + else 128 + free_pages((unsigned long)pg_vec[i], order); 129 + } 130 + } 131 + kfree(pg_vec); 132 + } 133 + 134 + static void *alloc_one_pg_vec_page(unsigned long order) 135 + { 136 + void *buffer; 137 + gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP | __GFP_ZERO | 138 + __GFP_NOWARN | __GFP_NORETRY; 139 + 140 + buffer = (void *)__get_free_pages(gfp_flags, order); 141 + if (buffer != NULL) 142 + return buffer; 143 + 144 + buffer = vzalloc((1 << order) * PAGE_SIZE); 145 + if (buffer != NULL) 146 + return buffer; 147 + 148 + gfp_flags &= ~__GFP_NORETRY; 149 + return (void *)__get_free_pages(gfp_flags, order); 150 + } 151 + 152 + static void **alloc_pg_vec(struct netlink_sock *nlk, 153 + struct nl_mmap_req *req, unsigned int order) 154 + { 155 + unsigned int block_nr = req->nm_block_nr; 156 + unsigned int i; 157 + void **pg_vec, *ptr; 158 + 159 + pg_vec = kcalloc(block_nr, sizeof(void *), GFP_KERNEL); 160 + if (pg_vec == NULL) 161 + return NULL; 162 + 163 + for (i = 0; i < block_nr; i++) { 164 + pg_vec[i] = ptr = alloc_one_pg_vec_page(order); 165 + if (pg_vec[i] == NULL) 166 + goto err1; 167 + } 168 + 169 + return pg_vec; 170 + err1: 171 + free_pg_vec(pg_vec, order, block_nr); 172 + return NULL; 173 + } 174 
+ 175 + static int netlink_set_ring(struct sock *sk, struct nl_mmap_req *req, 176 + bool closing, bool tx_ring) 177 + { 178 + struct netlink_sock *nlk = nlk_sk(sk); 179 + struct netlink_ring *ring; 180 + struct sk_buff_head *queue; 181 + void **pg_vec = NULL; 182 + unsigned int order = 0; 183 + int err; 184 + 185 + ring = tx_ring ? &nlk->tx_ring : &nlk->rx_ring; 186 + queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue; 187 + 188 + if (!closing) { 189 + if (atomic_read(&nlk->mapped)) 190 + return -EBUSY; 191 + if (atomic_read(&ring->pending)) 192 + return -EBUSY; 193 + } 194 + 195 + if (req->nm_block_nr) { 196 + if (ring->pg_vec != NULL) 197 + return -EBUSY; 198 + 199 + if ((int)req->nm_block_size <= 0) 200 + return -EINVAL; 201 + if (!IS_ALIGNED(req->nm_block_size, PAGE_SIZE)) 202 + return -EINVAL; 203 + if (req->nm_frame_size < NL_MMAP_HDRLEN) 204 + return -EINVAL; 205 + if (!IS_ALIGNED(req->nm_frame_size, NL_MMAP_MSG_ALIGNMENT)) 206 + return -EINVAL; 207 + 208 + ring->frames_per_block = req->nm_block_size / 209 + req->nm_frame_size; 210 + if (ring->frames_per_block == 0) 211 + return -EINVAL; 212 + if (ring->frames_per_block * req->nm_block_nr != 213 + req->nm_frame_nr) 214 + return -EINVAL; 215 + 216 + order = get_order(req->nm_block_size); 217 + pg_vec = alloc_pg_vec(nlk, req, order); 218 + if (pg_vec == NULL) 219 + return -ENOMEM; 220 + } else { 221 + if (req->nm_frame_nr) 222 + return -EINVAL; 223 + } 224 + 225 + err = -EBUSY; 226 + mutex_lock(&nlk->pg_vec_lock); 227 + if (closing || atomic_read(&nlk->mapped) == 0) { 228 + err = 0; 229 + spin_lock_bh(&queue->lock); 230 + 231 + ring->frame_max = req->nm_frame_nr - 1; 232 + ring->head = 0; 233 + ring->frame_size = req->nm_frame_size; 234 + ring->pg_vec_pages = req->nm_block_size / PAGE_SIZE; 235 + 236 + swap(ring->pg_vec_len, req->nm_block_nr); 237 + swap(ring->pg_vec_order, order); 238 + swap(ring->pg_vec, pg_vec); 239 + 240 + __skb_queue_purge(queue); 241 + spin_unlock_bh(&queue->lock); 242 + 243 + 
WARN_ON(atomic_read(&nlk->mapped)); 244 + } 245 + mutex_unlock(&nlk->pg_vec_lock); 246 + 247 + if (pg_vec) 248 + free_pg_vec(pg_vec, order, req->nm_block_nr); 249 + return err; 250 + } 251 + 252 + static void netlink_mm_open(struct vm_area_struct *vma) 253 + { 254 + struct file *file = vma->vm_file; 255 + struct socket *sock = file->private_data; 256 + struct sock *sk = sock->sk; 257 + 258 + if (sk) 259 + atomic_inc(&nlk_sk(sk)->mapped); 260 + } 261 + 262 + static void netlink_mm_close(struct vm_area_struct *vma) 263 + { 264 + struct file *file = vma->vm_file; 265 + struct socket *sock = file->private_data; 266 + struct sock *sk = sock->sk; 267 + 268 + if (sk) 269 + atomic_dec(&nlk_sk(sk)->mapped); 270 + } 271 + 272 + static const struct vm_operations_struct netlink_mmap_ops = { 273 + .open = netlink_mm_open, 274 + .close = netlink_mm_close, 275 + }; 276 + 277 + static int netlink_mmap(struct file *file, struct socket *sock, 278 + struct vm_area_struct *vma) 279 + { 280 + struct sock *sk = sock->sk; 281 + struct netlink_sock *nlk = nlk_sk(sk); 282 + struct netlink_ring *ring; 283 + unsigned long start, size, expected; 284 + unsigned int i; 285 + int err = -EINVAL; 286 + 287 + if (vma->vm_pgoff) 288 + return -EINVAL; 289 + 290 + mutex_lock(&nlk->pg_vec_lock); 291 + 292 + expected = 0; 293 + for (ring = &nlk->rx_ring; ring <= &nlk->tx_ring; ring++) { 294 + if (ring->pg_vec == NULL) 295 + continue; 296 + expected += ring->pg_vec_len * ring->pg_vec_pages * PAGE_SIZE; 297 + } 298 + 299 + if (expected == 0) 300 + goto out; 301 + 302 + size = vma->vm_end - vma->vm_start; 303 + if (size != expected) 304 + goto out; 305 + 306 + start = vma->vm_start; 307 + for (ring = &nlk->rx_ring; ring <= &nlk->tx_ring; ring++) { 308 + if (ring->pg_vec == NULL) 309 + continue; 310 + 311 + for (i = 0; i < ring->pg_vec_len; i++) { 312 + struct page *page; 313 + void *kaddr = ring->pg_vec[i]; 314 + unsigned int pg_num; 315 + 316 + for (pg_num = 0; pg_num < ring->pg_vec_pages; pg_num++) { 317 
+ page = pgvec_to_page(kaddr); 318 + err = vm_insert_page(vma, start, page); 319 + if (err < 0) 320 + goto out; 321 + start += PAGE_SIZE; 322 + kaddr += PAGE_SIZE; 323 + } 324 + } 325 + } 326 + 327 + atomic_inc(&nlk->mapped); 328 + vma->vm_ops = &netlink_mmap_ops; 329 + err = 0; 330 + out: 331 + mutex_unlock(&nlk->pg_vec_lock); 332 + return 0; 333 + } 334 + #else /* CONFIG_NETLINK_MMAP */ 335 + #define netlink_mmap sock_no_mmap 336 + #endif /* CONFIG_NETLINK_MMAP */ 337 + 111 338 static void netlink_destroy_callback(struct netlink_callback *cb) 112 339 { 113 340 kfree_skb(cb->skb); ··· 375 146 } 376 147 377 148 skb_queue_purge(&sk->sk_receive_queue); 149 + #ifdef CONFIG_NETLINK_MMAP 150 + if (1) { 151 + struct nl_mmap_req req; 152 + 153 + memset(&req, 0, sizeof(req)); 154 + if (nlk->rx_ring.pg_vec) 155 + netlink_set_ring(sk, &req, true, false); 156 + memset(&req, 0, sizeof(req)); 157 + if (nlk->tx_ring.pg_vec) 158 + netlink_set_ring(sk, &req, true, true); 159 + } 160 + #endif /* CONFIG_NETLINK_MMAP */ 378 161 379 162 if (!sock_flag(sk, SOCK_DEAD)) { 380 163 printk(KERN_ERR "Freeing alive netlink socket %p\n", sk); ··· 650 409 mutex_init(nlk->cb_mutex); 651 410 } 652 411 init_waitqueue_head(&nlk->wait); 412 + #ifdef CONFIG_NETLINK_MMAP 413 + mutex_init(&nlk->pg_vec_lock); 414 + #endif 653 415 654 416 sk->sk_destruct = netlink_sock_destruct; 655 417 sk->sk_protocol = protocol; ··· 1455 1211 if (level != SOL_NETLINK) 1456 1212 return -ENOPROTOOPT; 1457 1213 1458 - if (optlen >= sizeof(int) && 1214 + if (optname != NETLINK_RX_RING && optname != NETLINK_TX_RING && 1215 + optlen >= sizeof(int) && 1459 1216 get_user(val, (unsigned int __user *)optval)) 1460 1217 return -EFAULT; 1461 1218 ··· 1505 1260 } 1506 1261 err = 0; 1507 1262 break; 1263 + #ifdef CONFIG_NETLINK_MMAP 1264 + case NETLINK_RX_RING: 1265 + case NETLINK_TX_RING: { 1266 + struct nl_mmap_req req; 1267 + 1268 + /* Rings might consume more memory than queue limits, require 1269 + * CAP_NET_ADMIN. 
1270 + */ 1271 + if (!capable(CAP_NET_ADMIN)) 1272 + return -EPERM; 1273 + if (optlen < sizeof(req)) 1274 + return -EINVAL; 1275 + if (copy_from_user(&req, optval, sizeof(req))) 1276 + return -EFAULT; 1277 + err = netlink_set_ring(sk, &req, false, 1278 + optname == NETLINK_TX_RING); 1279 + break; 1280 + } 1281 + #endif /* CONFIG_NETLINK_MMAP */ 1508 1282 default: 1509 1283 err = -ENOPROTOOPT; 1510 1284 } ··· 2357 2093 .getsockopt = netlink_getsockopt, 2358 2094 .sendmsg = netlink_sendmsg, 2359 2095 .recvmsg = netlink_recvmsg, 2360 - .mmap = sock_no_mmap, 2096 + .mmap = netlink_mmap, 2361 2097 .sendpage = sock_no_sendpage, 2362 2098 }; 2363 2099
+20
net/netlink/af_netlink.h
··· 6 6 #define NLGRPSZ(x) (ALIGN(x, sizeof(unsigned long) * 8) / 8) 7 7 #define NLGRPLONGS(x) (NLGRPSZ(x)/sizeof(unsigned long)) 8 8 9 + struct netlink_ring { 10 + void **pg_vec; 11 + unsigned int head; 12 + unsigned int frames_per_block; 13 + unsigned int frame_size; 14 + unsigned int frame_max; 15 + 16 + unsigned int pg_vec_order; 17 + unsigned int pg_vec_pages; 18 + unsigned int pg_vec_len; 19 + 20 + atomic_t pending; 21 + }; 22 + 9 23 struct netlink_sock { 10 24 /* struct sock has to be the first member of netlink_sock */ 11 25 struct sock sk; ··· 38 24 void (*netlink_rcv)(struct sk_buff *skb); 39 25 void (*netlink_bind)(int group); 40 26 struct module *module; 27 + #ifdef CONFIG_NETLINK_MMAP 28 + struct mutex pg_vec_lock; 29 + struct netlink_ring rx_ring; 30 + struct netlink_ring tx_ring; 31 + atomic_t mapped; 32 + #endif /* CONFIG_NETLINK_MMAP */ 41 33 }; 42 34 43 35 static inline struct netlink_sock *nlk_sk(struct sock *sk)