random: make /dev/urandom scalable for silly userspace programs

On a 4-socket (NUMA) system where a large number of application
threads are all trying to read from /dev/urandom, the system can end
up spending 80% of its time contending on the global urandom
spinlock. The application should have used its own PRNG, but let's
try to keep it from running, lemming-like, straight over the locking
cliff.

Reported-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>

+58 -4
+58 -4
drivers/char/random.c
··· 436 #define crng_ready() (likely(crng_init > 0)) 437 static int crng_init_cnt = 0; 438 #define CRNG_INIT_CNT_THRESH (2*CHACHA20_KEY_SIZE) 439 static void extract_crng(__u8 out[CHACHA20_BLOCK_SIZE]); 440 static void process_random_ready_list(void); 441 ··· 758 759 static DECLARE_WAIT_QUEUE_HEAD(crng_init_wait); 760 761 static void crng_initialize(struct crng_state *crng) 762 { 763 int i; ··· 827 if (num == 0) 828 return; 829 } else 830 - extract_crng(buf.block); 831 spin_lock_irqsave(&primary_crng.lock, flags); 832 for (i = 0; i < 8; i++) { 833 unsigned long rv; ··· 847 spin_unlock_irqrestore(&primary_crng.lock, flags); 848 } 849 850 static inline void crng_wait_ready(void) 851 { 852 wait_event_interruptible(crng_init_wait, crng_ready()); 853 } 854 855 - static void extract_crng(__u8 out[CHACHA20_BLOCK_SIZE]) 856 { 857 unsigned long v, flags; 858 - struct crng_state *crng = &primary_crng; 859 860 if (crng_init > 1 && 861 time_after(jiffies, crng->init_time + CRNG_RESEED_INTERVAL)) 862 - crng_reseed(crng, &input_pool); 863 spin_lock_irqsave(&crng->lock, flags); 864 if (arch_get_random_long(&v)) 865 crng->state[14] ^= v; ··· 874 if (crng->state[12] == 0) 875 crng->state[13]++; 876 spin_unlock_irqrestore(&crng->lock, flags); 877 } 878 879 static ssize_t extract_crng_user(void __user *buf, size_t nbytes) ··· 1607 */ 1608 static int rand_initialize(void) 1609 { 1610 init_std_data(&input_pool); 1611 init_std_data(&blocking_pool); 1612 crng_initialize(&primary_crng); 1613 return 0; 1614 } 1615 early_initcall(rand_initialize);
··· 436 #define crng_ready() (likely(crng_init > 0)) 437 static int crng_init_cnt = 0; 438 #define CRNG_INIT_CNT_THRESH (2*CHACHA20_KEY_SIZE) 439 + static void _extract_crng(struct crng_state *crng, 440 + __u8 out[CHACHA20_BLOCK_SIZE]); 441 static void extract_crng(__u8 out[CHACHA20_BLOCK_SIZE]); 442 static void process_random_ready_list(void); 443 ··· 756 757 static DECLARE_WAIT_QUEUE_HEAD(crng_init_wait); 758 759 + #ifdef CONFIG_NUMA 760 + /* 761 + * Hack to deal with crazy userspace progams when they are all trying 762 + * to access /dev/urandom in parallel. The programs are almost 763 + * certainly doing something terribly wrong, but we'll work around 764 + * their brain damage. 765 + */ 766 + static struct crng_state **crng_node_pool __read_mostly; 767 + #endif 768 + 769 static void crng_initialize(struct crng_state *crng) 770 { 771 int i; ··· 815 if (num == 0) 816 return; 817 } else 818 + _extract_crng(&primary_crng, buf.block); 819 spin_lock_irqsave(&primary_crng.lock, flags); 820 for (i = 0; i < 8; i++) { 821 unsigned long rv; ··· 835 spin_unlock_irqrestore(&primary_crng.lock, flags); 836 } 837 838 + static inline void maybe_reseed_primary_crng(void) 839 + { 840 + if (crng_init > 2 && 841 + time_after(jiffies, primary_crng.init_time + CRNG_RESEED_INTERVAL)) 842 + crng_reseed(&primary_crng, &input_pool); 843 + } 844 + 845 static inline void crng_wait_ready(void) 846 { 847 wait_event_interruptible(crng_init_wait, crng_ready()); 848 } 849 850 + static void _extract_crng(struct crng_state *crng, 851 + __u8 out[CHACHA20_BLOCK_SIZE]) 852 { 853 unsigned long v, flags; 854 855 if (crng_init > 1 && 856 time_after(jiffies, crng->init_time + CRNG_RESEED_INTERVAL)) 857 + crng_reseed(crng, crng == &primary_crng ? 
&input_pool : NULL); 858 spin_lock_irqsave(&crng->lock, flags); 859 if (arch_get_random_long(&v)) 860 crng->state[14] ^= v; ··· 855 if (crng->state[12] == 0) 856 crng->state[13]++; 857 spin_unlock_irqrestore(&crng->lock, flags); 858 + } 859 + 860 + static void extract_crng(__u8 out[CHACHA20_BLOCK_SIZE]) 861 + { 862 + struct crng_state *crng = NULL; 863 + 864 + #ifdef CONFIG_NUMA 865 + if (crng_node_pool) 866 + crng = crng_node_pool[numa_node_id()]; 867 + if (crng == NULL) 868 + #endif 869 + crng = &primary_crng; 870 + _extract_crng(crng, out); 871 } 872 873 static ssize_t extract_crng_user(void __user *buf, size_t nbytes) ··· 1575 */ 1576 static int rand_initialize(void) 1577 { 1578 + #ifdef CONFIG_NUMA 1579 + int i; 1580 + int num_nodes = num_possible_nodes(); 1581 + struct crng_state *crng; 1582 + struct crng_state **pool; 1583 + #endif 1584 + 1585 init_std_data(&input_pool); 1586 init_std_data(&blocking_pool); 1587 crng_initialize(&primary_crng); 1588 + 1589 + #ifdef CONFIG_NUMA 1590 + pool = kmalloc(num_nodes * sizeof(void *), 1591 + GFP_KERNEL|__GFP_NOFAIL|__GFP_ZERO); 1592 + for (i=0; i < num_nodes; i++) { 1593 + crng = kmalloc_node(sizeof(struct crng_state), 1594 + GFP_KERNEL | __GFP_NOFAIL, i); 1595 + spin_lock_init(&crng->lock); 1596 + crng_initialize(crng); 1597 + pool[i] = crng; 1598 + 1599 + } 1600 + mb(); 1601 + crng_node_pool = pool; 1602 + #endif 1603 return 0; 1604 } 1605 early_initcall(rand_initialize);