Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

RDS: Info and stats

RDS currently generates a lot of stats that are accessible via
the rds-info utility. This code implements the support for this.

Signed-off-by: Andy Grover <andy.grover@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

authored by

Andy Grover and committed by
David S. Miller
a8c879a7 0fbc78cb

+419
+241
net/rds/info.c
··· 1 + /* 2 + * Copyright (c) 2006 Oracle. All rights reserved. 3 + * 4 + * This software is available to you under a choice of one of two 5 + * licenses. You may choose to be licensed under the terms of the GNU 6 + * General Public License (GPL) Version 2, available from the file 7 + * COPYING in the main directory of this source tree, or the 8 + * OpenIB.org BSD license below: 9 + * 10 + * Redistribution and use in source and binary forms, with or 11 + * without modification, are permitted provided that the following 12 + * conditions are met: 13 + * 14 + * - Redistributions of source code must retain the above 15 + * copyright notice, this list of conditions and the following 16 + * disclaimer. 17 + * 18 + * - Redistributions in binary form must reproduce the above 19 + * copyright notice, this list of conditions and the following 20 + * disclaimer in the documentation and/or other materials 21 + * provided with the distribution. 22 + * 23 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 24 + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 25 + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 26 + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 27 + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 28 + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 29 + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 30 + * SOFTWARE. 31 + * 32 + */ 33 + #include <linux/percpu.h> 34 + #include <linux/seq_file.h> 35 + #include <linux/proc_fs.h> 36 + 37 + #include "rds.h" 38 + 39 + /* 40 + * This file implements a getsockopt() call which copies a set of fixed 41 + * sized structs into a user-specified buffer as a means of providing 42 + * read-only information about RDS. 43 + * 44 + * For a given information source there are a given number of fixed sized 45 + * structs at a given time. The structs are only copied if the user-specified 46 + * buffer is big enough. The destination pages that make up the buffer 47 + * are pinned for the duration of the copy. 48 + * 49 + * This gives us the following benefits: 50 + * 51 + * - simple implementation, no copy "position" across multiple calls 52 + * - consistent snapshot of an info source 53 + * - atomic copy works well with whatever locking info source has 54 + * - one portable tool to get rds info across implementations 55 + * - long-lived tool can get info without allocating 56 + * 57 + * at the following costs: 58 + * 59 + * - info source copy must be pinned, may be "large" 60 + */ 61 + 62 + struct rds_info_iterator { 63 + struct page **pages; 64 + void *addr; 65 + unsigned long offset; 66 + }; 67 + 68 + static DEFINE_SPINLOCK(rds_info_lock); 69 + static rds_info_func rds_info_funcs[RDS_INFO_LAST - RDS_INFO_FIRST + 1]; 70 + 71 + void rds_info_register_func(int optname, rds_info_func func) 72 + { 73 + int offset = optname - RDS_INFO_FIRST; 74 + 75 + BUG_ON(optname < RDS_INFO_FIRST || optname > RDS_INFO_LAST); 76 + 77 + spin_lock(&rds_info_lock); 78 + BUG_ON(rds_info_funcs[offset] != NULL); 79 + rds_info_funcs[offset] = func; 80 + spin_unlock(&rds_info_lock); 81 + } 82 + 83 + void rds_info_deregister_func(int optname, rds_info_func func) 84 + { 85 + int offset = optname - RDS_INFO_FIRST; 86 + 87 + BUG_ON(optname < RDS_INFO_FIRST || optname > RDS_INFO_LAST); 88 + 89 + spin_lock(&rds_info_lock); 90 + BUG_ON(rds_info_funcs[offset] != func); 91 + rds_info_funcs[offset] = NULL; 92 + spin_unlock(&rds_info_lock); 93 + } 94 + 95 + /* 96 + * Typically we hold an atomic kmap across multiple rds_info_copy() calls 97 + * because the kmap is so expensive. This must be called before using blocking 98 + * operations while holding the mapping and as the iterator is torn down. 99 + */ 100 + void rds_info_iter_unmap(struct rds_info_iterator *iter) 101 + { 102 + if (iter->addr != NULL) { 103 + kunmap_atomic(iter->addr, KM_USER0); 104 + iter->addr = NULL; 105 + } 106 + } 107 + 108 + /* 109 + * get_user_pages() called flush_dcache_page() on the pages for us. 110 + */ 111 + void rds_info_copy(struct rds_info_iterator *iter, void *data, 112 + unsigned long bytes) 113 + { 114 + unsigned long this; 115 + 116 + while (bytes) { 117 + if (iter->addr == NULL) 118 + iter->addr = kmap_atomic(*iter->pages, KM_USER0); 119 + 120 + this = min(bytes, PAGE_SIZE - iter->offset); 121 + 122 + rdsdebug("page %p addr %p offset %lu this %lu data %p " 123 + "bytes %lu\n", *iter->pages, iter->addr, 124 + iter->offset, this, data, bytes); 125 + 126 + memcpy(iter->addr + iter->offset, data, this); 127 + 128 + data += this; 129 + bytes -= this; 130 + iter->offset += this; 131 + 132 + if (iter->offset == PAGE_SIZE) { 133 + kunmap_atomic(iter->addr, KM_USER0); 134 + iter->addr = NULL; 135 + iter->offset = 0; 136 + iter->pages++; 137 + } 138 + } 139 + } 140 + 141 + /* 142 + * @optval points to the userspace buffer that the information snapshot 143 + * will be copied into. 144 + * 145 + * @optlen on input is the size of the buffer in userspace. @optlen 146 + * on output is the size of the requested snapshot in bytes. 147 + * 148 + * This function returns -errno if there is a failure, particularly -ENOSPC 149 + * if the given userspace buffer was not large enough to fit the snapshot. 150 + * On success it returns the positive number of bytes of each array element 151 + * in the snapshot. 152 + */ 153 + int rds_info_getsockopt(struct socket *sock, int optname, char __user *optval, 154 + int __user *optlen) 155 + { 156 + struct rds_info_iterator iter; 157 + struct rds_info_lengths lens; 158 + unsigned long nr_pages = 0; 159 + unsigned long start; 160 + unsigned long i; 161 + rds_info_func func; 162 + struct page **pages = NULL; 163 + int ret; 164 + int len; 165 + int total; 166 + 167 + if (get_user(len, optlen)) { 168 + ret = -EFAULT; 169 + goto out; 170 + } 171 + 172 + /* check for all kinds of wrapping and the like */ 173 + start = (unsigned long)optval; 174 + if (len < 0 || len + PAGE_SIZE - 1 < len || start + len < start) { 175 + ret = -EINVAL; 176 + goto out; 177 + } 178 + 179 + /* a 0 len call is just trying to probe its length */ 180 + if (len == 0) 181 + goto call_func; 182 + 183 + nr_pages = (PAGE_ALIGN(start + len) - (start & PAGE_MASK)) 184 + >> PAGE_SHIFT; 185 + 186 + pages = kmalloc(nr_pages * sizeof(struct page *), GFP_KERNEL); 187 + if (pages == NULL) { 188 + ret = -ENOMEM; 189 + goto out; 190 + } 191 + down_read(&current->mm->mmap_sem); 192 + ret = get_user_pages(current, current->mm, start, nr_pages, 1, 0, 193 + pages, NULL); 194 + up_read(&current->mm->mmap_sem); 195 + if (ret != nr_pages) { 196 + if (ret > 0) 197 + nr_pages = ret; 198 + else 199 + nr_pages = 0; 200 + ret = -EAGAIN; /* XXX ? */ 201 + goto out; 202 + } 203 + 204 + rdsdebug("len %d nr_pages %lu\n", len, nr_pages); 205 + 206 + call_func: 207 + func = rds_info_funcs[optname - RDS_INFO_FIRST]; 208 + if (func == NULL) { 209 + ret = -ENOPROTOOPT; 210 + goto out; 211 + } 212 + 213 + iter.pages = pages; 214 + iter.addr = NULL; 215 + iter.offset = start & (PAGE_SIZE - 1); 216 + 217 + func(sock, len, &iter, &lens); 218 + BUG_ON(lens.each == 0); 219 + 220 + total = lens.nr * lens.each; 221 + 222 + rds_info_iter_unmap(&iter); 223 + 224 + if (total > len) { 225 + len = total; 226 + ret = -ENOSPC; 227 + } else { 228 + len = total; 229 + ret = lens.each; 230 + } 231 + 232 + if (put_user(len, optlen)) 233 + ret = -EFAULT; 234 + 235 + out: 236 + for (i = 0; pages != NULL && i < nr_pages; i++) 237 + put_page(pages[i]); 238 + kfree(pages); 239 + 240 + return ret; 241 + }
+30
net/rds/info.h
··· 1 + #ifndef _RDS_INFO_H 2 + #define _RDS_INFO_H 3 + 4 + struct rds_info_lengths { 5 + unsigned int nr; 6 + unsigned int each; 7 + }; 8 + 9 + struct rds_info_iterator; 10 + 11 + /* 12 + * These functions must fill in the fields of @lens to reflect the size 13 + * of the available info source. If the snapshot fits in @len then it 14 + * should be copied using @iter. The caller will deduce if it was copied 15 + * or not by comparing the lengths. 16 + */ 17 + typedef void (*rds_info_func)(struct socket *sock, unsigned int len, 18 + struct rds_info_iterator *iter, 19 + struct rds_info_lengths *lens); 20 + 21 + void rds_info_register_func(int optname, rds_info_func func); 22 + void rds_info_deregister_func(int optname, rds_info_func func); 23 + int rds_info_getsockopt(struct socket *sock, int optname, char __user *optval, 24 + int __user *optlen); 25 + void rds_info_copy(struct rds_info_iterator *iter, void *data, 26 + unsigned long bytes); 27 + void rds_info_iter_unmap(struct rds_info_iterator *iter); 28 + 29 + 30 + #endif
+148
net/rds/stats.c
··· 1 + /* 2 + * Copyright (c) 2006 Oracle. All rights reserved. 3 + * 4 + * This software is available to you under a choice of one of two 5 + * licenses. You may choose to be licensed under the terms of the GNU 6 + * General Public License (GPL) Version 2, available from the file 7 + * COPYING in the main directory of this source tree, or the 8 + * OpenIB.org BSD license below: 9 + * 10 + * Redistribution and use in source and binary forms, with or 11 + * without modification, are permitted provided that the following 12 + * conditions are met: 13 + * 14 + * - Redistributions of source code must retain the above 15 + * copyright notice, this list of conditions and the following 16 + * disclaimer. 17 + * 18 + * - Redistributions in binary form must reproduce the above 19 + * copyright notice, this list of conditions and the following 20 + * disclaimer in the documentation and/or other materials 21 + * provided with the distribution. 22 + * 23 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 24 + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 25 + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 26 + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 27 + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 28 + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 29 + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 30 + * SOFTWARE. 31 + * 32 + */ 33 + #include <linux/percpu.h> 34 + #include <linux/seq_file.h> 35 + #include <linux/proc_fs.h> 36 + 37 + #include "rds.h" 38 + 39 + DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_statistics, rds_stats); 40 + 41 + /* :.,$s/unsigned long\>.*\<s_\(.*\);/"\1",/g */ 42 + 43 + static char *rds_stat_names[] = { 44 + "conn_reset", 45 + "recv_drop_bad_checksum", 46 + "recv_drop_old_seq", 47 + "recv_drop_no_sock", 48 + "recv_drop_dead_sock", 49 + "recv_deliver_raced", 50 + "recv_delivered", 51 + "recv_queued", 52 + "recv_immediate_retry", 53 + "recv_delayed_retry", 54 + "recv_ack_required", 55 + "recv_rdma_bytes", 56 + "recv_ping", 57 + "send_queue_empty", 58 + "send_queue_full", 59 + "send_sem_contention", 60 + "send_sem_queue_raced", 61 + "send_immediate_retry", 62 + "send_delayed_retry", 63 + "send_drop_acked", 64 + "send_ack_required", 65 + "send_queued", 66 + "send_rdma", 67 + "send_rdma_bytes", 68 + "send_pong", 69 + "page_remainder_hit", 70 + "page_remainder_miss", 71 + "copy_to_user", 72 + "copy_from_user", 73 + "cong_update_queued", 74 + "cong_update_received", 75 + "cong_send_error", 76 + "cong_send_blocked", 77 + }; 78 + 79 + void rds_stats_info_copy(struct rds_info_iterator *iter, 80 + uint64_t *values, char **names, size_t nr) 81 + { 82 + struct rds_info_counter ctr; 83 + size_t i; 84 + 85 + for (i = 0; i < nr; i++) { 86 + BUG_ON(strlen(names[i]) >= sizeof(ctr.name)); 87 + strncpy(ctr.name, names[i], sizeof(ctr.name) - 1); 88 + ctr.value = values[i]; 89 + 90 + rds_info_copy(iter, &ctr, sizeof(ctr)); 91 + } 92 + } 93 + 94 + /* 95 + * This gives global counters across all the transports. The strings 96 + * are copied in so that the tool doesn't need knowledge of the specific 97 + * stats that we're exporting. Some are pretty implementation dependent 98 + * and may change over time. That doesn't stop them from being useful. 99 + * 100 + * This is the only function in the chain that knows about the byte granular 101 + * length in userspace. It converts it to number of stat entries that the 102 + * rest of the functions operate in. 103 + */ 104 + static void rds_stats_info(struct socket *sock, unsigned int len, 105 + struct rds_info_iterator *iter, 106 + struct rds_info_lengths *lens) 107 + { 108 + struct rds_statistics stats = {0, }; 109 + uint64_t *src; 110 + uint64_t *sum; 111 + size_t i; 112 + int cpu; 113 + unsigned int avail; 114 + 115 + avail = len / sizeof(struct rds_info_counter); 116 + 117 + if (avail < ARRAY_SIZE(rds_stat_names)) { 118 + avail = 0; 119 + goto trans; 120 + } 121 + 122 + for_each_online_cpu(cpu) { 123 + src = (uint64_t *)&(per_cpu(rds_stats, cpu)); 124 + sum = (uint64_t *)&stats; 125 + for (i = 0; i < sizeof(stats) / sizeof(uint64_t); i++) 126 + *(sum++) += *(src++); 127 + } 128 + 129 + rds_stats_info_copy(iter, (uint64_t *)&stats, rds_stat_names, 130 + ARRAY_SIZE(rds_stat_names)); 131 + avail -= ARRAY_SIZE(rds_stat_names); 132 + 133 + trans: 134 + lens->each = sizeof(struct rds_info_counter); 135 + lens->nr = rds_trans_stats_info_copy(iter, avail) + 136 + ARRAY_SIZE(rds_stat_names); 137 + } 138 + 139 + void rds_stats_exit(void) 140 + { 141 + rds_info_deregister_func(RDS_INFO_COUNTERS, rds_stats_info); 142 + } 143 + 144 + int __init rds_stats_init(void) 145 + { 146 + rds_info_register_func(RDS_INFO_COUNTERS, rds_stats_info); 147 + return 0; 148 + }