Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

sysctl: allow for strict write position handling

When writing to a sysctl string, each write, regardless of VFS position,
begins writing the string from the start. This means the contents of
the last write to the sysctl control the string contents, rather than
those of the first:

open("/proc/sys/kernel/modprobe", O_WRONLY) = 1
write(1, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"..., 4096) = 4096
write(1, "/bin/true", 9) = 9
close(1) = 0

$ cat /proc/sys/kernel/modprobe
/bin/true

Expected behavior would be to have the sysctl be "AAAA..." capped at
maxlen (in this case KMOD_PATH_LEN: 256), instead of truncating to the
contents of the second write. Similarly, multiple short writes do not
currently append to the sysctl; each one restarts from the beginning.

The old behavior is sufficiently unlike that of regular POSIX files that
audits of software that interacts with sysctls can end up in unexpected
or dangerous situations. For example, "as long as the input starts with a
trusted path" turns out to be an insufficient filter, because the input
must also be entirely contained in a single write syscall -- not a
common consideration, especially for high level tools.

This provides kernel.sysctl_writes_strict as a way to make this behavior
act in a less surprising manner for strings, and disallows non-zero file
position when writing numeric sysctls (similar to what is already done
when reading from non-zero file positions). For now, the default (0) is
to warn about non-zero file position use, but retain the legacy
behavior. Setting this to -1 disables the warning, and setting this to
1 enables the file position respecting behavior.

[akpm@linux-foundation.org: fix build]
[akpm@linux-foundation.org: move misplaced hunk, per Randy]
Signed-off-by: Kees Cook <keescook@chromium.org>
Cc: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

authored by

Kees Cook and committed by
Linus Torvalds
f4aacea2 2ca9bb45

+88 -2
+21
Documentation/sysctl/kernel.txt
··· 77 77 - shmmni 78 78 - stop-a [ SPARC only ] 79 79 - sysrq ==> Documentation/sysrq.txt 80 + - sysctl_writes_strict 80 81 - tainted 81 82 - threads-max 82 83 - unknown_nmi_panic ··· 760 759 761 760 Note that if you change this from 0 to 1, already created segments 762 761 without users and with a dead originative process will be destroyed. 762 + 763 + ============================================================== 764 + 765 + sysctl_writes_strict: 766 + 767 + Control how file position affects the behavior of updating sysctl values 768 + via the /proc/sys interface: 769 + 770 + -1 - Legacy per-write sysctl value handling, with no printk warnings. 771 + Each write syscall must fully contain the sysctl value to be 772 + written, and multiple writes on the same sysctl file descriptor 773 + will rewrite the sysctl value, regardless of file position. 774 + 0 - (default) Same behavior as above, but warn about processes that 775 + perform writes to a sysctl file descriptor when the file position 776 + is not 0. 777 + 1 - Respect file position when writing sysctl strings. Multiple writes 778 + will append to the sysctl value buffer. Anything past the max length 779 + of the sysctl value buffer will be ignored. Writes to numeric sysctl 780 + entries must always be at file position 0 and the value must be 781 + fully contained in the buffer sent in the write syscall. 763 782 764 783 ============================================================== 765 784
+67 -2
kernel/sysctl.c
··· 173 173 #endif 174 174 175 175 #ifdef CONFIG_PROC_SYSCTL 176 + 177 + #define SYSCTL_WRITES_LEGACY -1 178 + #define SYSCTL_WRITES_WARN 0 179 + #define SYSCTL_WRITES_STRICT 1 180 + 181 + static int sysctl_writes_strict = SYSCTL_WRITES_WARN; 182 + 176 183 static int proc_do_cad_pid(struct ctl_table *table, int write, 177 184 void __user *buffer, size_t *lenp, loff_t *ppos); 178 185 static int proc_taint(struct ctl_table *table, int write, ··· 501 494 .maxlen = sizeof(long), 502 495 .mode = 0644, 503 496 .proc_handler = proc_taint, 497 + }, 498 + { 499 + .procname = "sysctl_writes_strict", 500 + .data = &sysctl_writes_strict, 501 + .maxlen = sizeof(int), 502 + .mode = 0644, 503 + .proc_handler = proc_dointvec_minmax, 504 + .extra1 = &neg_one, 505 + .extra2 = &one, 504 506 }, 505 507 #endif 506 508 #ifdef CONFIG_LATENCYTOP ··· 1733 1717 } 1734 1718 1735 1719 if (write) { 1736 - /* Start writing from beginning of buffer. */ 1737 - len = 0; 1720 + if (sysctl_writes_strict == SYSCTL_WRITES_STRICT) { 1721 + /* Only continue writes not past the end of buffer. */ 1722 + len = strlen(data); 1723 + if (len > maxlen - 1) 1724 + len = maxlen - 1; 1725 + 1726 + if (*ppos > len) 1727 + return 0; 1728 + len = *ppos; 1729 + } else { 1730 + /* Start writing from beginning of buffer. */ 1731 + len = 0; 1732 + } 1733 + 1738 1734 *ppos += *lenp; 1739 1735 p = buffer; 1740 1736 while ((p - buffer) < *lenp && len < maxlen - 1) { ··· 1786 1758 return 0; 1787 1759 } 1788 1760 1761 + static void warn_sysctl_write(struct ctl_table *table) 1762 + { 1763 + pr_warn_once("%s wrote to %s when file position was not 0!\n" 1764 + "This will not be supported in the future. 
To silence this\n" 1765 + "warning, set kernel.sysctl_writes_strict = -1\n", 1766 + current->comm, table->procname); 1767 + } 1768 + 1789 1769 /** 1790 1770 * proc_dostring - read a string sysctl 1791 1771 * @table: the sysctl table ··· 1814 1778 int proc_dostring(struct ctl_table *table, int write, 1815 1779 void __user *buffer, size_t *lenp, loff_t *ppos) 1816 1780 { 1781 + if (write && *ppos && sysctl_writes_strict == SYSCTL_WRITES_WARN) 1782 + warn_sysctl_write(table); 1783 + 1817 1784 return _proc_do_string((char *)(table->data), table->maxlen, write, 1818 1785 (char __user *)buffer, lenp, ppos); 1819 1786 } ··· 1992 1953 conv = do_proc_dointvec_conv; 1993 1954 1994 1955 if (write) { 1956 + if (*ppos) { 1957 + switch (sysctl_writes_strict) { 1958 + case SYSCTL_WRITES_STRICT: 1959 + goto out; 1960 + case SYSCTL_WRITES_WARN: 1961 + warn_sysctl_write(table); 1962 + break; 1963 + default: 1964 + break; 1965 + } 1966 + } 1967 + 1995 1968 if (left > PAGE_SIZE - 1) 1996 1969 left = PAGE_SIZE - 1; 1997 1970 page = __get_free_page(GFP_TEMPORARY); ··· 2061 2010 return err ? : -EINVAL; 2062 2011 } 2063 2012 *lenp -= left; 2013 + out: 2064 2014 *ppos += *lenp; 2065 2015 return err; 2066 2016 } ··· 2254 2202 left = *lenp; 2255 2203 2256 2204 if (write) { 2205 + if (*ppos) { 2206 + switch (sysctl_writes_strict) { 2207 + case SYSCTL_WRITES_STRICT: 2208 + goto out; 2209 + case SYSCTL_WRITES_WARN: 2210 + warn_sysctl_write(table); 2211 + break; 2212 + default: 2213 + break; 2214 + } 2215 + } 2216 + 2257 2217 if (left > PAGE_SIZE - 1) 2258 2218 left = PAGE_SIZE - 1; 2259 2219 page = __get_free_page(GFP_TEMPORARY); ··· 2321 2257 return err ? : -EINVAL; 2322 2258 } 2323 2259 *lenp -= left; 2260 + out: 2324 2261 *ppos += *lenp; 2325 2262 return err; 2326 2263 }