Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

xen-netback: coalesce slots in TX path and fix regressions

This patch tries to coalesce tx requests when constructing grant copy
structures. It enables netback to deal with the situation where the
frontend's MAX_SKB_FRAGS is larger than the backend's MAX_SKB_FRAGS.

With the help of coalescing, this patch tries to address two regressions
while avoiding reopening the security hole fixed by XSA-39.

Regression 1. The reduction of the number of supported ring entries (slots)
per packet (from 18 to 17). This regression had been around for some time but
remained unnoticed until the XSA-39 security fix. It is fixed by coalescing
slots.

Regression 2. The XSA-39 security fix turned "too many frags" errors from
simply dropping the packet into a fatal error that disables the VIF. This is
fixed by coalescing slots (handling 18 slots when the backend's MAX_SKB_FRAGS
is 17), which rules out false positives (using 18 slots is legitimate) while
dropping packets that use 19 to `max_skb_slots` slots.

To avoid reopening the security hole fixed by XSA-39, a frontend sending a
packet using more than max_skb_slots slots is considered malicious.

The behavior of netback for a packet is thus:

1-18 slots: valid
19-max_skb_slots slots: drop and respond with an error
max_skb_slots+ slots: fatal error

max_skb_slots is configurable by admin, default value is 20.

Also change variable name from "frags" to "slots" in netbk_count_requests.

Please note that the RX path still has a dependency on MAX_SKB_FRAGS. This
will be fixed in a separate patch.

Signed-off-by: Wei Liu <wei.liu2@citrix.com>
Acked-by: Ian Campbell <ian.campbell@citrix.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

authored by

Wei Liu and committed by
David S. Miller
2810e5b9 9ecd1a75

+242 -51
+224 -51
drivers/net/xen-netback/netback.c
··· 47 47 #include <asm/xen/hypercall.h> 48 48 #include <asm/xen/page.h> 49 49 50 - struct pending_tx_info { 51 - struct xen_netif_tx_request req; 52 - struct xenvif *vif; 53 - }; 50 + /* 51 + * This is the maximum slots a skb can have. If a guest sends a skb 52 + * which exceeds this limit it is considered malicious. 53 + */ 54 + #define MAX_SKB_SLOTS_DEFAULT 20 55 + static unsigned int max_skb_slots = MAX_SKB_SLOTS_DEFAULT; 56 + module_param(max_skb_slots, uint, 0444); 57 + 54 58 typedef unsigned int pending_ring_idx_t; 59 + #define INVALID_PENDING_RING_IDX (~0U) 60 + 61 + struct pending_tx_info { 62 + struct xen_netif_tx_request req; /* coalesced tx request */ 63 + struct xenvif *vif; 64 + pending_ring_idx_t head; /* head != INVALID_PENDING_RING_IDX 65 + * if it is head of one or more tx 66 + * reqs 67 + */ 68 + }; 55 69 56 70 struct netbk_rx_meta { 57 71 int id; ··· 116 102 atomic_t netfront_count; 117 103 118 104 struct pending_tx_info pending_tx_info[MAX_PENDING_REQS]; 119 - struct gnttab_copy tx_copy_ops[MAX_PENDING_REQS]; 105 + /* Coalescing tx requests before copying makes number of grant 106 + * copy ops greater or equal to number of slots required. In 107 + * worst case a tx request consumes 2 gnttab_copy. 108 + */ 109 + struct gnttab_copy tx_copy_ops[2*MAX_PENDING_REQS]; 120 110 121 111 u16 pending_ring[MAX_PENDING_REQS]; 122 112 ··· 135 117 136 118 static struct xen_netbk *xen_netbk; 137 119 static int xen_netbk_group_nr; 120 + 121 + /* 122 + * If head != INVALID_PENDING_RING_IDX, it means this tx request is head of 123 + * one or more merged tx requests, otherwise it is the continuation of 124 + * previous tx request. 
125 + */ 126 + static inline int pending_tx_is_head(struct xen_netbk *netbk, RING_IDX idx) 127 + { 128 + return netbk->pending_tx_info[idx].head != INVALID_PENDING_RING_IDX; 129 + } 138 130 139 131 void xen_netbk_add_xenvif(struct xenvif *vif) 140 132 { ··· 278 250 { 279 251 int max = DIV_ROUND_UP(vif->dev->mtu, PAGE_SIZE); 280 252 253 + /* XXX FIXME: RX path dependent on MAX_SKB_FRAGS */ 281 254 if (vif->can_sg || vif->gso || vif->gso_prefix) 282 255 max += MAX_SKB_FRAGS + 1; /* extra_info + frags */ 283 256 ··· 686 657 __skb_queue_tail(&rxq, skb); 687 658 688 659 /* Filled the batch queue? */ 660 + /* XXX FIXME: RX path dependent on MAX_SKB_FRAGS */ 689 661 if (count + MAX_SKB_FRAGS >= XEN_NETIF_RX_RING_SIZE) 690 662 break; 691 663 } ··· 928 898 929 899 static int netbk_count_requests(struct xenvif *vif, 930 900 struct xen_netif_tx_request *first, 901 + RING_IDX first_idx, 931 902 struct xen_netif_tx_request *txp, 932 903 int work_to_do) 933 904 { 934 905 RING_IDX cons = vif->tx.req_cons; 935 - int frags = 0; 906 + int slots = 0; 907 + int drop_err = 0; 936 908 937 909 if (!(first->flags & XEN_NETTXF_more_data)) 938 910 return 0; 939 911 940 912 do { 941 - if (frags >= work_to_do) { 942 - netdev_err(vif->dev, "Need more frags\n"); 913 + if (slots >= work_to_do) { 914 + netdev_err(vif->dev, 915 + "Asked for %d slots but exceeds this limit\n", 916 + work_to_do); 943 917 netbk_fatal_tx_err(vif); 944 918 return -ENODATA; 945 919 } 946 920 947 - if (unlikely(frags >= MAX_SKB_FRAGS)) { 948 - netdev_err(vif->dev, "Too many frags\n"); 921 + /* This guest is really using too many slots and 922 + * considered malicious. 
923 + */ 924 + if (unlikely(slots >= max_skb_slots)) { 925 + netdev_err(vif->dev, 926 + "Malicious frontend using %d slots, threshold %u\n", 927 + slots, max_skb_slots); 949 928 netbk_fatal_tx_err(vif); 950 929 return -E2BIG; 951 930 } 952 931 953 - memcpy(txp, RING_GET_REQUEST(&vif->tx, cons + frags), 932 + /* Xen network protocol had implicit dependency on 933 + * MAX_SKB_FRAGS. XEN_NETIF_NR_SLOTS_MIN is set to the 934 + * historical MAX_SKB_FRAGS value 18 to honor the same 935 + * behavior as before. Any packet using more than 18 936 + * slots but less than max_skb_slots slots is dropped 937 + */ 938 + if (!drop_err && slots >= XEN_NETIF_NR_SLOTS_MIN) { 939 + if (net_ratelimit()) 940 + netdev_dbg(vif->dev, 941 + "Too many slots (%d) exceeding limit (%d), dropping packet\n", 942 + slots, XEN_NETIF_NR_SLOTS_MIN); 943 + drop_err = -E2BIG; 944 + } 945 + 946 + memcpy(txp, RING_GET_REQUEST(&vif->tx, cons + slots), 954 947 sizeof(*txp)); 955 948 if (txp->size > first->size) { 956 - netdev_err(vif->dev, "Frag is bigger than frame.\n"); 949 + netdev_err(vif->dev, 950 + "Invalid tx request, slot size %u > remaining size %u\n", 951 + txp->size, first->size); 957 952 netbk_fatal_tx_err(vif); 958 953 return -EIO; 959 954 } 960 955 961 956 first->size -= txp->size; 962 - frags++; 957 + slots++; 963 958 964 959 if (unlikely((txp->offset + txp->size) > PAGE_SIZE)) { 965 - netdev_err(vif->dev, "txp->offset: %x, size: %u\n", 960 + netdev_err(vif->dev, "Cross page boundary, txp->offset: %x, size: %u\n", 966 961 txp->offset, txp->size); 967 962 netbk_fatal_tx_err(vif); 968 963 return -EINVAL; 969 964 } 970 965 } while ((txp++)->flags & XEN_NETTXF_more_data); 971 - return frags; 966 + 967 + if (drop_err) { 968 + netbk_tx_err(vif, first, first_idx + slots); 969 + return drop_err; 970 + } 971 + 972 + return slots; 972 973 } 973 974 974 975 static struct page *xen_netbk_alloc_page(struct xen_netbk *netbk, ··· 1023 962 struct skb_shared_info *shinfo = skb_shinfo(skb); 1024 963 
skb_frag_t *frags = shinfo->frags; 1025 964 u16 pending_idx = *((u16 *)skb->data); 1026 - int i, start; 965 + u16 head_idx = 0; 966 + int slot, start; 967 + struct page *page; 968 + pending_ring_idx_t index, start_idx = 0; 969 + uint16_t dst_offset; 970 + unsigned int nr_slots; 971 + struct pending_tx_info *first = NULL; 972 + 973 + /* At this point shinfo->nr_frags is in fact the number of 974 + * slots, which can be as large as XEN_NETIF_NR_SLOTS_MIN. 975 + */ 976 + nr_slots = shinfo->nr_frags; 1027 977 1028 978 /* Skip first skb fragment if it is on same page as header fragment. */ 1029 979 start = (frag_get_pending_idx(&shinfo->frags[0]) == pending_idx); 1030 980 1031 - for (i = start; i < shinfo->nr_frags; i++, txp++) { 1032 - struct page *page; 1033 - pending_ring_idx_t index; 981 + /* Coalesce tx requests, at this point the packet passed in 982 + * should be <= 64K. Any packets larger than 64K have been 983 + * handled in netbk_count_requests(). 984 + */ 985 + for (shinfo->nr_frags = slot = start; slot < nr_slots; 986 + shinfo->nr_frags++) { 1034 987 struct pending_tx_info *pending_tx_info = 1035 988 netbk->pending_tx_info; 1036 989 1037 - index = pending_index(netbk->pending_cons++); 1038 - pending_idx = netbk->pending_ring[index]; 1039 - page = xen_netbk_alloc_page(netbk, pending_idx); 990 + page = alloc_page(GFP_KERNEL|__GFP_COLD); 1040 991 if (!page) 1041 992 goto err; 1042 993 1043 - gop->source.u.ref = txp->gref; 1044 - gop->source.domid = vif->domid; 1045 - gop->source.offset = txp->offset; 994 + dst_offset = 0; 995 + first = NULL; 996 + while (dst_offset < PAGE_SIZE && slot < nr_slots) { 997 + gop->flags = GNTCOPY_source_gref; 1046 998 1047 - gop->dest.u.gmfn = virt_to_mfn(page_address(page)); 1048 - gop->dest.domid = DOMID_SELF; 1049 - gop->dest.offset = txp->offset; 999 + gop->source.u.ref = txp->gref; 1000 + gop->source.domid = vif->domid; 1001 + gop->source.offset = txp->offset; 1050 1002 1051 - gop->len = txp->size; 1052 - gop->flags = 
GNTCOPY_source_gref; 1003 + gop->dest.domid = DOMID_SELF; 1053 1004 1054 - gop++; 1005 + gop->dest.offset = dst_offset; 1006 + gop->dest.u.gmfn = virt_to_mfn(page_address(page)); 1055 1007 1056 - memcpy(&pending_tx_info[pending_idx].req, txp, sizeof(*txp)); 1057 - xenvif_get(vif); 1058 - pending_tx_info[pending_idx].vif = vif; 1059 - frag_set_pending_idx(&frags[i], pending_idx); 1008 + if (dst_offset + txp->size > PAGE_SIZE) { 1009 + /* This page can only merge a portion 1010 + * of tx request. Do not increment any 1011 + * pointer / counter here. The txp 1012 + * will be dealt with in future 1013 + * rounds, eventually hitting the 1014 + * `else` branch. 1015 + */ 1016 + gop->len = PAGE_SIZE - dst_offset; 1017 + txp->offset += gop->len; 1018 + txp->size -= gop->len; 1019 + dst_offset += gop->len; /* quit loop */ 1020 + } else { 1021 + /* This tx request can be merged in the page */ 1022 + gop->len = txp->size; 1023 + dst_offset += gop->len; 1024 + 1025 + index = pending_index(netbk->pending_cons++); 1026 + 1027 + pending_idx = netbk->pending_ring[index]; 1028 + 1029 + memcpy(&pending_tx_info[pending_idx].req, txp, 1030 + sizeof(*txp)); 1031 + xenvif_get(vif); 1032 + 1033 + pending_tx_info[pending_idx].vif = vif; 1034 + 1035 + /* Poison these fields, corresponding 1036 + * fields for head tx req will be set 1037 + * to correct values after the loop. 
1038 + */ 1039 + netbk->mmap_pages[pending_idx] = (void *)(~0UL); 1040 + pending_tx_info[pending_idx].head = 1041 + INVALID_PENDING_RING_IDX; 1042 + 1043 + if (!first) { 1044 + first = &pending_tx_info[pending_idx]; 1045 + start_idx = index; 1046 + head_idx = pending_idx; 1047 + } 1048 + 1049 + txp++; 1050 + slot++; 1051 + } 1052 + 1053 + gop++; 1054 + } 1055 + 1056 + first->req.offset = 0; 1057 + first->req.size = dst_offset; 1058 + first->head = start_idx; 1059 + set_page_ext(page, netbk, head_idx); 1060 + netbk->mmap_pages[head_idx] = page; 1061 + frag_set_pending_idx(&frags[shinfo->nr_frags], head_idx); 1060 1062 } 1063 + 1064 + BUG_ON(shinfo->nr_frags > MAX_SKB_FRAGS); 1061 1065 1062 1066 return gop; 1063 1067 err: 1064 1068 /* Unwind, freeing all pages and sending error responses. */ 1065 - while (i-- > start) { 1066 - xen_netbk_idx_release(netbk, frag_get_pending_idx(&frags[i]), 1067 - XEN_NETIF_RSP_ERROR); 1069 + while (shinfo->nr_frags-- > start) { 1070 + xen_netbk_idx_release(netbk, 1071 + frag_get_pending_idx(&frags[shinfo->nr_frags]), 1072 + XEN_NETIF_RSP_ERROR); 1068 1073 } 1069 1074 /* The head too, if necessary. */ 1070 1075 if (start) ··· 1146 1019 struct gnttab_copy *gop = *gopp; 1147 1020 u16 pending_idx = *((u16 *)skb->data); 1148 1021 struct skb_shared_info *shinfo = skb_shinfo(skb); 1022 + struct pending_tx_info *tx_info; 1149 1023 int nr_frags = shinfo->nr_frags; 1150 1024 int i, err, start; 1025 + u16 peek; /* peek into next tx request */ 1151 1026 1152 1027 /* Check status of header. */ 1153 1028 err = gop->status; ··· 1161 1032 1162 1033 for (i = start; i < nr_frags; i++) { 1163 1034 int j, newerr; 1035 + pending_ring_idx_t head; 1164 1036 1165 1037 pending_idx = frag_get_pending_idx(&shinfo->frags[i]); 1038 + tx_info = &netbk->pending_tx_info[pending_idx]; 1039 + head = tx_info->head; 1166 1040 1167 1041 /* Check error status: if okay then remember grant handle. 
*/ 1168 - newerr = (++gop)->status; 1042 + do { 1043 + newerr = (++gop)->status; 1044 + if (newerr) 1045 + break; 1046 + peek = netbk->pending_ring[pending_index(++head)]; 1047 + } while (!pending_tx_is_head(netbk, peek)); 1048 + 1169 1049 if (likely(!newerr)) { 1170 1050 /* Had a previous error? Invalidate this fragment. */ 1171 1051 if (unlikely(err)) ··· 1394 1256 struct sk_buff *skb; 1395 1257 int ret; 1396 1258 1397 - while (((nr_pending_reqs(netbk) + MAX_SKB_FRAGS) < MAX_PENDING_REQS) && 1259 + while ((nr_pending_reqs(netbk) + XEN_NETIF_NR_SLOTS_MIN 1260 + < MAX_PENDING_REQS) && 1398 1261 !list_empty(&netbk->net_schedule_list)) { 1399 1262 struct xenvif *vif; 1400 1263 struct xen_netif_tx_request txreq; 1401 - struct xen_netif_tx_request txfrags[MAX_SKB_FRAGS]; 1264 + struct xen_netif_tx_request txfrags[max_skb_slots]; 1402 1265 struct page *page; 1403 1266 struct xen_netif_extra_info extras[XEN_NETIF_EXTRA_TYPE_MAX-1]; 1404 1267 u16 pending_idx; ··· 1460 1321 continue; 1461 1322 } 1462 1323 1463 - ret = netbk_count_requests(vif, &txreq, txfrags, work_to_do); 1324 + ret = netbk_count_requests(vif, &txreq, idx, 1325 + txfrags, work_to_do); 1464 1326 if (unlikely(ret < 0)) 1465 1327 continue; 1466 1328 ··· 1488 1348 pending_idx = netbk->pending_ring[index]; 1489 1349 1490 1350 data_len = (txreq.size > PKT_PROT_LEN && 1491 - ret < MAX_SKB_FRAGS) ? 1351 + ret < XEN_NETIF_NR_SLOTS_MIN) ? 
1492 1352 PKT_PROT_LEN : txreq.size; 1493 1353 1494 1354 skb = alloc_skb(data_len + NET_SKB_PAD + NET_IP_ALIGN, ··· 1538 1398 memcpy(&netbk->pending_tx_info[pending_idx].req, 1539 1399 &txreq, sizeof(txreq)); 1540 1400 netbk->pending_tx_info[pending_idx].vif = vif; 1401 + netbk->pending_tx_info[pending_idx].head = index; 1541 1402 *((u16 *)skb->data) = pending_idx; 1542 1403 1543 1404 __skb_put(skb, data_len); ··· 1669 1528 { 1670 1529 struct xenvif *vif; 1671 1530 struct pending_tx_info *pending_tx_info; 1672 - pending_ring_idx_t index; 1531 + pending_ring_idx_t head; 1532 + u16 peek; /* peek into next tx request */ 1533 + 1534 + BUG_ON(netbk->mmap_pages[pending_idx] == (void *)(~0UL)); 1673 1535 1674 1536 /* Already complete? */ 1675 1537 if (netbk->mmap_pages[pending_idx] == NULL) ··· 1681 1537 pending_tx_info = &netbk->pending_tx_info[pending_idx]; 1682 1538 1683 1539 vif = pending_tx_info->vif; 1540 + head = pending_tx_info->head; 1684 1541 1685 - make_tx_response(vif, &pending_tx_info->req, status); 1542 + BUG_ON(!pending_tx_is_head(netbk, head)); 1543 + BUG_ON(netbk->pending_ring[pending_index(head)] != pending_idx); 1686 1544 1687 - index = pending_index(netbk->pending_prod++); 1688 - netbk->pending_ring[index] = pending_idx; 1545 + do { 1546 + pending_ring_idx_t index; 1547 + pending_ring_idx_t idx = pending_index(head); 1548 + u16 info_idx = netbk->pending_ring[idx]; 1689 1549 1690 - xenvif_put(vif); 1550 + pending_tx_info = &netbk->pending_tx_info[info_idx]; 1551 + make_tx_response(vif, &pending_tx_info->req, status); 1691 1552 1692 - netbk->mmap_pages[pending_idx]->mapping = NULL; 1553 + /* Setting any number other than 1554 + * INVALID_PENDING_RING_IDX indicates this slot is 1555 + * starting a new packet / ending a previous packet. 
1556 + */ 1557 + pending_tx_info->head = 0; 1558 + 1559 + index = pending_index(netbk->pending_prod++); 1560 + netbk->pending_ring[index] = netbk->pending_ring[info_idx]; 1561 + 1562 + xenvif_put(vif); 1563 + 1564 + peek = netbk->pending_ring[pending_index(++head)]; 1565 + 1566 + } while (!pending_tx_is_head(netbk, peek)); 1567 + 1568 + netbk->mmap_pages[pending_idx]->mapping = 0; 1693 1569 put_page(netbk->mmap_pages[pending_idx]); 1694 1570 netbk->mmap_pages[pending_idx] = NULL; 1695 1571 } 1572 + 1696 1573 1697 1574 static void make_tx_response(struct xenvif *vif, 1698 1575 struct xen_netif_tx_request *txp, ··· 1767 1602 static inline int tx_work_todo(struct xen_netbk *netbk) 1768 1603 { 1769 1604 1770 - if (((nr_pending_reqs(netbk) + MAX_SKB_FRAGS) < MAX_PENDING_REQS) && 1771 - !list_empty(&netbk->net_schedule_list)) 1605 + if ((nr_pending_reqs(netbk) + XEN_NETIF_NR_SLOTS_MIN 1606 + < MAX_PENDING_REQS) && 1607 + !list_empty(&netbk->net_schedule_list)) 1772 1608 return 1; 1773 1609 1774 1610 return 0; ··· 1851 1685 1852 1686 if (!xen_domain()) 1853 1687 return -ENODEV; 1688 + 1689 + if (max_skb_slots < XEN_NETIF_NR_SLOTS_MIN) { 1690 + printk(KERN_INFO 1691 + "xen-netback: max_skb_slots too small (%d), bump it to XEN_NETIF_NR_SLOTS_MIN (%d)\n", 1692 + max_skb_slots, XEN_NETIF_NR_SLOTS_MIN); 1693 + max_skb_slots = XEN_NETIF_NR_SLOTS_MIN; 1694 + } 1854 1695 1855 1696 xen_netbk_group_nr = num_online_cpus(); 1856 1697 xen_netbk = vzalloc(sizeof(struct xen_netbk) * xen_netbk_group_nr);
+18
include/xen/interface/io/netif.h
··· 13 13 #include <xen/interface/grant_table.h> 14 14 15 15 /* 16 + * Older implementation of Xen network frontend / backend has an 17 + * implicit dependency on the MAX_SKB_FRAGS as the maximum number of 18 + * ring slots a skb can use. Netfront / netback may not work as 19 + * expected when frontend and backend have different MAX_SKB_FRAGS. 20 + * 21 + * A better approach is to add mechanism for netfront / netback to 22 + * negotiate this value. However we cannot fix all possible 23 + * frontends, so we need to define a value which states the minimum 24 + * slots backend must support. 25 + * 26 + * The minimum value derives from older Linux kernel's MAX_SKB_FRAGS 27 + * (18), which is proved to work with most frontends. Any new backend 28 + * which doesn't negotiate with frontend should expect frontend to 29 + * send a valid packet using slots up to this value. 30 + */ 31 + #define XEN_NETIF_NR_SLOTS_MIN 18 32 + 33 + /* 16 34 * Notifications after enqueuing any type of message should be conditional on 17 35 * the appropriate req_event or rsp_event field in the shared ring. 18 36 * If the client sends notification for rx requests then it should specify