Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Document the vDSO and add a reference parser

It turns out that parsing the vDSO is nontrivial if you don't already
have an ELF dynamic loader around. So document it in Documentation/ABI
and add a reference CC0-licenced parser.

This code is dedicated to Go issue 1933:
http://code.google.com/p/go/issues/detail?id=1933

Signed-off-by: Andy Lutomirski <luto@mit.edu>
Link: http://lkml.kernel.org/r/a315a9514cd71bcf29436cc31e35aada21a5ff21.1310563276.git.luto@mit.edu
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>

authored by

Andy Lutomirski and committed by
H. Peter Anvin
98eedc3a 574c44fa

+394
+27
Documentation/ABI/stable/vdso
··· 1 + On some architectures, when the kernel loads any userspace program it 2 + maps an ELF DSO into that program's address space. This DSO is called 3 + the vDSO and it often contains useful and highly-optimized alternatives 4 + to real syscalls. 5 + 6 + These functions are called just like ordinary C functions according to 7 + your platform's ABI. Call them from a sensible context. (For example, 8 + if you set CS on x86 to something strange, the vDSO functions are 9 + within their rights to crash.) In addition, if you pass a bad 10 + pointer to a vDSO function, you might get SIGSEGV instead of -EFAULT. 11 + 12 + To find the DSO, parse the auxiliary vector passed to the program's 13 + entry point. The AT_SYSINFO_EHDR entry will point to the vDSO. 14 + 15 + The vDSO uses symbol versioning; whenever you request a symbol from the 16 + vDSO, specify the version you are expecting. 17 + 18 + Programs that dynamically link to glibc will use the vDSO automatically. 19 + Otherwise, you can use the reference parser in Documentation/vDSO/parse_vdso.c. 20 + 21 + Unless otherwise noted, the set of symbols with any given version and the 22 + ABI of those symbols is considered stable. It may vary across architectures, 23 + though. 24 + 25 + (As of this writing, this ABI documentation has been confirmed for x86_64. 26 + The maintainers of the other vDSO-using architectures should confirm 27 + that it is correct for their architecture.)
+256
Documentation/vDSO/parse_vdso.c
/*
 * parse_vdso.c: Linux reference vDSO parser
 * Written by Andrew Lutomirski, 2011.
 *
 * This code is meant to be linked in to various programs that run on Linux.
 * As such, it is available with as few restrictions as possible.  This file
 * is licensed under the Creative Commons Zero License, version 1.0,
 * available at http://creativecommons.org/publicdomain/zero/1.0/legalcode
 *
 * The vDSO is a regular ELF DSO that the kernel maps into user space when
 * it starts a program.  It works equally well in statically and dynamically
 * linked binaries.
 *
 * This code is tested on x86_64.  In principle it should work on any 64-bit
 * architecture that has a vDSO.
 */

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>
#include <elf.h>

/*
 * To use this vDSO parser, first call one of the vdso_init_* functions.
 * If you've already parsed auxv, then pass the value of AT_SYSINFO_EHDR
 * to vdso_init_from_sysinfo_ehdr.  Otherwise pass auxv to vdso_init_from_auxv.
 * Then call vdso_sym for each symbol you want.  For example, to look up
 * gettimeofday on x86_64, use:
 *
 *     <some pointer> = vdso_sym("LINUX_2.6", "gettimeofday");
 * or
 *     <some pointer> = vdso_sym("LINUX_2.6", "__vdso_gettimeofday");
 *
 * vdso_sym will return 0 if the symbol doesn't exist or if the init function
 * failed or was not called.  vdso_sym is a little slow, so its return value
 * should be cached.
 *
 * vdso_sym is threadsafe; the init functions are not.
 *
 * These are the prototypes:
 */
extern void vdso_init_from_auxv(void *auxv);
extern void vdso_init_from_sysinfo_ehdr(uintptr_t base);
extern void *vdso_sym(const char *version, const char *name);


/* And here's the code. */

#ifndef __x86_64__
# error Not yet ported to non-x86_64 architectures
#endif

/*
 * Parser state.  It is filled in by the init functions and only read by
 * vdso_sym and vdso_match_version.  Nothing here is meaningful unless
 * valid is true.
 */
static struct vdso_info
{
	bool valid;

	/* Load information */
	uintptr_t load_addr;
	uintptr_t load_offset;  /* load_addr - recorded vaddr */

	/* Symbol table */
	Elf64_Sym *symtab;
	const char *symstrings;
	Elf64_Word *bucket, *chain;
	Elf64_Word nbucket, nchain;

	/* Version table */
	Elf64_Versym *versym;
	Elf64_Verdef *verdef;
} vdso_info;

/*
 * Straight from the ELF specification.
 *
 * The string is taken as plain char so callers need no cast; the bytes
 * are read as unsigned char, as the reference hash function requires.
 */
static unsigned long elf_hash(const char *name)
{
	const unsigned char *p = (const unsigned char *)name;
	unsigned long h = 0, g;

	while (*p) {
		h = (h << 4) + *p++;
		g = h & 0xf0000000;
		if (g)
			h ^= g >> 24;
		h &= ~g;
	}
	return h;
}

/*
 * Initialize the parser from the address of the vDSO's ELF header (the
 * value of the AT_SYSINFO_EHDR auxv entry).
 *
 * On any failure the parser is left invalid, so that vdso_sym returns 0.
 * A base of 0 is treated as "no vDSO".
 */
void vdso_init_from_sysinfo_ehdr(uintptr_t base)
{
	size_t i;
	bool found_vaddr = false;

	vdso_info.valid = false;

	if (!base)
		return;  /* Failed: nothing to parse */

	vdso_info.load_addr = base;

	Elf64_Ehdr *hdr = (Elf64_Ehdr *)base;
	if (hdr->e_ident[EI_CLASS] != ELFCLASS64)
		return;  /* Failed: everything below assumes ELF64 layouts */

	Elf64_Phdr *pt = (Elf64_Phdr *)(vdso_info.load_addr + hdr->e_phoff);
	Elf64_Dyn *dyn = NULL;

	/*
	 * We need two things from the segment table: the load offset
	 * and the dynamic table.
	 */
	for (i = 0; i < hdr->e_phnum; i++) {
		if (pt[i].p_type == PT_LOAD && !found_vaddr) {
			/* Only the first PT_LOAD segment sets the offset. */
			found_vaddr = true;
			vdso_info.load_offset = base
				+ (uintptr_t)pt[i].p_offset
				- (uintptr_t)pt[i].p_vaddr;
		} else if (pt[i].p_type == PT_DYNAMIC) {
			dyn = (Elf64_Dyn *)(base + pt[i].p_offset);
		}
	}

	if (!found_vaddr || !dyn)
		return;  /* Failed */

	/*
	 * Fish out the useful bits of the dynamic table.
	 */
	Elf64_Word *hash = NULL;
	vdso_info.symstrings = NULL;
	vdso_info.symtab = NULL;
	vdso_info.versym = NULL;
	vdso_info.verdef = NULL;
	for (i = 0; dyn[i].d_tag != DT_NULL; i++) {
		uintptr_t addr = (uintptr_t)dyn[i].d_un.d_ptr
			+ vdso_info.load_offset;

		switch (dyn[i].d_tag) {
		case DT_STRTAB:
			vdso_info.symstrings = (const char *)addr;
			break;
		case DT_SYMTAB:
			vdso_info.symtab = (Elf64_Sym *)addr;
			break;
		case DT_HASH:
			hash = (Elf64_Word *)addr;
			break;
		case DT_VERSYM:
			vdso_info.versym = (Elf64_Versym *)addr;
			break;
		case DT_VERDEF:
			vdso_info.verdef = (Elf64_Verdef *)addr;
			break;
		default:
			break;  /* Not interesting to us */
		}
	}
	if (!vdso_info.symstrings || !vdso_info.symtab || !hash)
		return;  /* Failed */

	/* Without version definitions the versym table is useless. */
	if (!vdso_info.verdef)
		vdso_info.versym = NULL;

	/* Parse the hash table header. */
	vdso_info.nbucket = hash[0];
	vdso_info.nchain = hash[1];
	vdso_info.bucket = &hash[2];
	vdso_info.chain = &hash[vdso_info.nbucket + 2];

	/* vdso_sym reduces the hash modulo nbucket, so it must be nonzero. */
	if (vdso_info.nbucket == 0)
		return;  /* Failed */

	/* That's all we need. */
	vdso_info.valid = true;
}

/*
 * This is a helper function to check if the version indexed by
 * ver matches name (which hashes to hash).
 *
 * The version definition table is a mess, and I don't know how
 * to do this in better than linear time without allocating memory
 * to build an index.  I also don't know why the table has
 * variable size entries in the first place.
 *
 * For added fun, I can't find a comprehensible specification of how
 * to parse all the weird flags in the table.
 *
 * So I just parse the whole table every time.
 *
 * The caller must only call this when vdso_info.verdef is non-null.
 */
static bool vdso_match_version(Elf64_Versym ver,
			       const char *name, Elf64_Word hash)
{
	/* First step: find the version definition */
	ver &= 0x7fff;  /* Apparently bit 15 means "hidden" */
	Elf64_Verdef *def = vdso_info.verdef;
	while (true) {
		if ((def->vd_flags & VER_FLG_BASE) == 0
		    && (def->vd_ndx & 0x7fff) == ver)
			break;

		if (def->vd_next == 0)
			return false;  /* No definition. */

		/* vd_next is a byte offset from the current entry. */
		def = (Elf64_Verdef *)((char *)def + def->vd_next);
	}

	/* Now figure out whether it matches. */
	Elf64_Verdaux *aux = (Elf64_Verdaux *)((char *)def + def->vd_aux);
	return def->vd_hash == hash
		&& !strcmp(name, vdso_info.symstrings + aux->vda_name);
}

/*
 * Look up the function called name with symbol version version.
 *
 * Returns the address of the function, or 0 if the parser has not been
 * successfully initialized or no matching defined global/weak function
 * exists.  If the vDSO carries no usable version information, the version
 * is not checked.
 */
void *vdso_sym(const char *version, const char *name)
{
	unsigned long ver_hash;

	if (!vdso_info.valid)
		return NULL;

	ver_hash = elf_hash(version);
	Elf64_Word chain = vdso_info.bucket[elf_hash(name) % vdso_info.nbucket];

	for (; chain != STN_UNDEF; chain = vdso_info.chain[chain]) {
		Elf64_Sym *sym = &vdso_info.symtab[chain];

		/* Check for a defined global or weak function w/ right name. */
		if (ELF64_ST_TYPE(sym->st_info) != STT_FUNC)
			continue;
		if (ELF64_ST_BIND(sym->st_info) != STB_GLOBAL &&
		    ELF64_ST_BIND(sym->st_info) != STB_WEAK)
			continue;
		if (sym->st_shndx == SHN_UNDEF)
			continue;
		if (strcmp(name, vdso_info.symstrings + sym->st_name))
			continue;

		/* Check symbol version. */
		if (vdso_info.versym
		    && !vdso_match_version(vdso_info.versym[chain],
					   version, ver_hash))
			continue;

		return (void *)(vdso_info.load_offset + sym->st_value);
	}

	return NULL;
}

/*
 * Initialize the parser from the auxiliary vector: scan it up to AT_NULL
 * for AT_SYSINFO_EHDR and hand its value to vdso_init_from_sysinfo_ehdr.
 *
 * If auxv is null or has no AT_SYSINFO_EHDR entry, the parser is left
 * invalid, so that vdso_sym returns 0.
 */
void vdso_init_from_auxv(void *auxv)
{
	Elf64_auxv_t *elf_auxv = auxv;

	vdso_info.valid = false;

	if (!elf_auxv)
		return;

	for (size_t i = 0; elf_auxv[i].a_type != AT_NULL; i++) {
		if (elf_auxv[i].a_type == AT_SYSINFO_EHDR) {
			vdso_init_from_sysinfo_ehdr(elf_auxv[i].a_un.a_val);
			return;
		}
	}
}
+111
Documentation/vDSO/vdso_test.c
/*
 * vdso_test.c: Sample code to test parse_vdso.c on x86_64
 * Copyright (c) 2011 Andy Lutomirski
 * Subject to the GNU General Public License, version 2
 *
 * You can amuse yourself by compiling with:
 * gcc -std=gnu99 -nostdlib
 *    -Os -fno-asynchronous-unwind-tables -flto
 *    vdso_test.c parse_vdso.c -o vdso_test
 * to generate a small binary with no dependencies at all.
 */

#include <sys/syscall.h>
#include <sys/time.h>
#include <unistd.h>
#include <stdint.h>

extern void *vdso_sym(const char *version, const char *name);
extern void vdso_init_from_sysinfo_ehdr(uintptr_t base);
extern void vdso_init_from_auxv(void *auxv);

/* We need a libc function... */

/*
 * Compare two NUL-terminated strings.  Returns 0 if they are equal, a
 * negative value if a sorts before b and a positive value otherwise;
 * bytes are compared as unsigned char.
 */
int strcmp(const char *a, const char *b)
{
	const unsigned char *x = (const unsigned char *)a;
	const unsigned char *y = (const unsigned char *)b;

	while (*x && *x == *y) {
		x++;
		y++;
	}

	return (int)*x - (int)*y;
}

/* ...and two syscalls.  This is x86_64-specific. */

/* Invoke write(fd, data, len) directly; returns the raw syscall result. */
static inline long linux_write(int fd, const void *data, size_t len)
{
	long ret;

	__asm__ volatile ("syscall" : "=a" (ret) : "a" (__NR_write),
			  "D" (fd), "S" (data), "d" (len) :
			  "cc", "memory", "rcx",
			  "r8", "r9", "r10", "r11" );
	return ret;
}

/* Invoke exit(code) directly. */
static inline void linux_exit(int code)
{
	/* The syscall instruction itself overwrites rcx and r11. */
	__asm__ volatile ("syscall" : : "a" (__NR_exit), "D" (code) :
			  "cc", "memory", "rcx", "r11");
}

/*
 * Write n in decimal, right-aligned so that its last digit lands on
 * *lastdig and earlier digits go to lower addresses.  The caller must
 * provide enough room before lastdig.  n == 0 is written as a single '0'.
 */
void to_base10(char *lastdig, uint64_t n)
{
	do {
		*lastdig = (n % 10) + '0';
		n /= 10;
		lastdig--;
	} while (n);
}

/*
 * C entry point, reached from _start with the initial stack pointer.
 * Locates auxv, looks up __vdso_gettimeofday in the vDSO, and prints the
 * current time.  Exits with 1 if the symbol is missing, with the call's
 * return value if gettimeofday fails, and with 0 on success.
 */
__attribute__((externally_visible)) void c_main(void **stack)
{
	/* Parse the stack */
	long argc = (long)*stack;
	stack += argc + 2;  /* Skip argc, argv[0..argc-1] and argv's NULL. */

	/* Now we're pointing at the environment.  Skip it. */
	while (*stack)
		stack++;
	stack++;

	/* Now we're pointing at auxv.  Initialize the vDSO parser. */
	vdso_init_from_auxv((void *)stack);

	/* Find gettimeofday. */
	typedef long (*gtod_t)(struct timeval *tv, struct timezone *tz);
	gtod_t gtod = (gtod_t)vdso_sym("LINUX_2.6", "__vdso_gettimeofday");

	if (!gtod)
		linux_exit(1);

	struct timeval tv;
	long ret = gtod(&tv, 0);

	if (ret == 0) {
		/*
		 * A 20-column blank field holds the seconds (enough for any
		 * uint64_t), followed by '.' and six microsecond digits.
		 */
		char buf[] = "The time is "
			     "                    "
			     ".000000\n";
		/* Last usec digit sits just before the '\n' and the NUL. */
		char *usec_last = buf + sizeof(buf) - 3;
		/* Step back over the six usec digits and the '.'. */
		char *sec_last = usec_last - 7;

		to_base10(sec_last, tv.tv_sec);
		to_base10(usec_last, tv.tv_usec);
		linux_write(1, buf, sizeof(buf) - 1);
	} else {
		linux_exit(ret);
	}

	linux_exit(0);
}

/*
 * This is the real entry point.  It passes the initial stack into
 * the C entry point.
 *
 * c_main is entered with a jmp rather than a call, so no return address
 * is pushed.  Align the stack and drop it by 8 bytes so that c_main sees
 * the alignment the x86_64 ABI gives a function that was called normally.
 */
__asm__ (
	".text\n"
	".global _start\n"
	".type _start,@function\n"
	"_start:\n\t"
	"mov %rsp,%rdi\n\t"
	"and $-16,%rsp\n\t"
	"sub $8,%rsp\n\t"
	"jmp c_main"
);