/*
 * Copyright 2010 Tilera Corporation. All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation, version 2.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
 * NON INFRINGEMENT.  See the GNU General Public License for
 * more details.
 */

#include <linux/string.h>
#include <linux/smp.h>
#include <linux/module.h>
#include <linux/uaccess.h>
#include <asm/fixmap.h>
#include <asm/kmap_types.h>
#include <asm/tlbflush.h>
#include <hv/hypervisor.h>
#include <arch/chip.h>


#if !CHIP_HAS_COHERENT_LOCAL_CACHE()

/* Defined in memcpy.S */
extern unsigned long __memcpy_asm(void *to, const void *from, unsigned long n);
extern unsigned long __copy_to_user_inatomic_asm(
	void __user *to, const void *from, unsigned long n);
extern unsigned long __copy_from_user_inatomic_asm(
	void *to, const void __user *from, unsigned long n);
extern unsigned long __copy_from_user_zeroing_asm(
	void *to, const void __user *from, unsigned long n);

typedef unsigned long (*memcpy_t)(void *, const void *, unsigned long);

/* Size above which to consider TLB games for performance */
#define LARGE_COPY_CUTOFF 2048

/* Communicate to the simulator what we are trying to do. */
#define sim_allow_multiple_caching(b) \
	__insn_mtspr(SPR_SIM_CONTROL, \
		     SIM_CONTROL_ALLOW_MULTIPLE_CACHING | ((b) << _SIM_CONTROL_OPERATOR_BITS))
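
/*
 * Overview (a rough sketch of how the pieces below fit together):
 * each exported entry point -- memcpy() and the __copy_*_user_*()
 * wrappers at the bottom of the file -- dispatches on size.  Copies
 * shorter than LARGE_COPY_CUTOFF go straight to the corresponding
 * assembly routine; longer ones go through fast_copy(), which tries
 * the temporary-mapping trick in memcpy_multicache() one page at a
 * time and falls back to the same assembly routine (passed in as a
 * memcpy_t) whenever a page doesn't qualify.
 */
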
/*
 * Copy memory by briefly enabling incoherent cacheline-at-a-time mode.
 *
 * We set up our own source and destination PTEs that we fully control.
 * This is the only way to guarantee that we don't race with another
 * thread that is modifying the PTE; we can't afford to try the
 * copy_{to,from}_user() technique of catching the interrupt, since
 * we must run with interrupts disabled to avoid the risk of some
 * other code seeing the incoherent data in our cache.  (Recall that
 * our cache is indexed by PA, so even if the other code doesn't use
 * our kmap_atomic virtual addresses, they'll still hit in cache using
 * the normal VAs that aren't supposed to hit in cache.)
 */
static void memcpy_multicache(void *dest, const void *source,
			      pte_t dst_pte, pte_t src_pte, int len)
{
	int idx;
	unsigned long flags, newsrc, newdst;
	pmd_t *pmdp;
	pte_t *ptep;
	int type0, type1;
	int cpu = get_cpu();

	/*
	 * Disable interrupts so that we don't recurse into memcpy()
	 * in an interrupt handler, nor accidentally reference
	 * the PA of the source from an interrupt routine.  Also
	 * notify the simulator that we're playing games so we don't
	 * generate spurious coherency warnings.
	 */
	local_irq_save(flags);
	sim_allow_multiple_caching(1);

	/* Set up the new dest mapping */
	type0 = kmap_atomic_idx_push();
	idx = FIX_KMAP_BEGIN + (KM_TYPE_NR * cpu) + type0;
	newdst = __fix_to_virt(idx) + ((unsigned long)dest & (PAGE_SIZE-1));
	pmdp = pmd_offset(pud_offset(pgd_offset_k(newdst), newdst), newdst);
	ptep = pte_offset_kernel(pmdp, newdst);
	if (pte_val(*ptep) != pte_val(dst_pte)) {
		set_pte(ptep, dst_pte);
		local_flush_tlb_page(NULL, newdst, PAGE_SIZE);
	}

	/* Set up the new source mapping */
	type1 = kmap_atomic_idx_push();
	idx += (type0 - type1);
	src_pte = hv_pte_set_nc(src_pte);
	src_pte = hv_pte_clear_writable(src_pte);  /* be paranoid */
	newsrc = __fix_to_virt(idx) + ((unsigned long)source & (PAGE_SIZE-1));
	pmdp = pmd_offset(pud_offset(pgd_offset_k(newsrc), newsrc), newsrc);
	ptep = pte_offset_kernel(pmdp, newsrc);
	__set_pte(ptep, src_pte);   /* set_pte() would be confused by this */
	local_flush_tlb_page(NULL, newsrc, PAGE_SIZE);

	/* Actually move the data. */
	__memcpy_asm((void *)newdst, (const void *)newsrc, len);

	/*
	 * Remap the source as locally-cached and not OLOC'ed so that
	 * we can inval without also invaling the remote cpu's cache.
	 * This also avoids known errata with inv'ing cacheable oloc data.
	 */
	src_pte = hv_pte_set_mode(src_pte, HV_PTE_MODE_CACHE_NO_L3);
	src_pte = hv_pte_set_writable(src_pte); /* need write access for inv */
	__set_pte(ptep, src_pte);   /* set_pte() would be confused by this */
	local_flush_tlb_page(NULL, newsrc, PAGE_SIZE);

	/*
	 * Do the actual invalidation, covering the full L2 cache line
	 * at the end since __memcpy_asm() is somewhat aggressive.
	 */
	__inv_buffer((void *)newsrc, len);

	/*
	 * We're done: notify the simulator that all is back to normal,
	 * and re-enable interrupts and pre-emption.
	 */
	kmap_atomic_idx_pop();
	kmap_atomic_idx_pop();
	sim_allow_multiple_caching(0);
	local_irq_restore(flags);
	put_cpu();
}
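
/*
 * Note (summarizing the caller's obligations, as enforced by fast_copy()
 * below): memcpy_multicache() maps exactly one page on each side, so the
 * copy must not cross a page boundary for either source or destination,
 * and source and destination must not share a page, since overlapping
 * cache lines on a shared page could be left incoherent.
 */
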
/*
 * Identify large copies from remotely-cached memory, and copy them
 * via memcpy_multicache() if they look good, otherwise fall back
 * to the particular kind of copying passed as the memcpy_t function.
 */
static unsigned long fast_copy(void *dest, const void *source, int len,
			       memcpy_t func)
{
	/*
	 * Check if it's big enough to bother with.  We may end up doing a
	 * small copy via TLB manipulation if we're near a page boundary,
	 * but presumably we'll make it up when we hit the second page.
	 */
	while (len >= LARGE_COPY_CUTOFF) {
		int copy_size, bytes_left_on_page;
		pte_t *src_ptep, *dst_ptep;
		pte_t src_pte, dst_pte;
		struct page *src_page, *dst_page;

		/* Is the source page oloc'ed to a remote cpu? */
retry_source:
		src_ptep = virt_to_pte(current->mm, (unsigned long)source);
		if (src_ptep == NULL)
			break;
		src_pte = *src_ptep;
		if (!hv_pte_get_present(src_pte) ||
		    !hv_pte_get_readable(src_pte) ||
		    hv_pte_get_mode(src_pte) != HV_PTE_MODE_CACHE_TILE_L3)
			break;
		if (get_remote_cache_cpu(src_pte) == smp_processor_id())
			break;
		src_page = pfn_to_page(hv_pte_get_pfn(src_pte));
		get_page(src_page);
		if (pte_val(src_pte) != pte_val(*src_ptep)) {
			put_page(src_page);
			goto retry_source;
		}
		if (pte_huge(src_pte)) {
			/* Adjust the PTE to correspond to a small page */
			int pfn = hv_pte_get_pfn(src_pte);
			pfn += (((unsigned long)source & (HPAGE_SIZE-1))
				>> PAGE_SHIFT);
			src_pte = pfn_pte(pfn, src_pte);
			src_pte = pte_mksmall(src_pte);
		}

		/* Is the destination page writable? */
retry_dest:
		dst_ptep = virt_to_pte(current->mm, (unsigned long)dest);
		if (dst_ptep == NULL) {
			put_page(src_page);
			break;
		}
		dst_pte = *dst_ptep;
		if (!hv_pte_get_present(dst_pte) ||
		    !hv_pte_get_writable(dst_pte)) {
			put_page(src_page);
			break;
		}
		dst_page = pfn_to_page(hv_pte_get_pfn(dst_pte));
		if (dst_page == src_page) {
			/*
			 * Source and dest are on the same page; this
			 * potentially exposes us to incoherence if any
			 * part of src and dest overlap on a cache line.
			 * Just give up rather than trying to be precise.
			 */
			put_page(src_page);
			break;
		}
		get_page(dst_page);
		if (pte_val(dst_pte) != pte_val(*dst_ptep)) {
			put_page(dst_page);
			goto retry_dest;
		}
		if (pte_huge(dst_pte)) {
			/* Adjust the PTE to correspond to a small page */
			int pfn = hv_pte_get_pfn(dst_pte);
			pfn += (((unsigned long)dest & (HPAGE_SIZE-1))
				>> PAGE_SHIFT);
			dst_pte = pfn_pte(pfn, dst_pte);
			dst_pte = pte_mksmall(dst_pte);
		}

		/* All looks good: create a cachable PTE and copy from it */
		copy_size = len;
		bytes_left_on_page =
			PAGE_SIZE - (((int)source) & (PAGE_SIZE-1));
		if (copy_size > bytes_left_on_page)
			copy_size = bytes_left_on_page;
		bytes_left_on_page =
			PAGE_SIZE - (((int)dest) & (PAGE_SIZE-1));
		if (copy_size > bytes_left_on_page)
			copy_size = bytes_left_on_page;
		memcpy_multicache(dest, source, dst_pte, src_pte, copy_size);

		/* Release the pages */
		put_page(dst_page);
		put_page(src_page);

		/* Continue on the next page */
		dest += copy_size;
		source += copy_size;
		len -= copy_size;
	}

	return func(dest, source, len);
}
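
/*
 * Worked example, for illustration only (assuming a 4 kB page size,
 * which is just an example value): with source at page offset 0xf00,
 * dest at page offset 0x040 and len == 8192, the first iteration of
 * fast_copy() clamps copy_size to 0x100 bytes (the space left on the
 * source page); the next iteration starts source-page-aligned and is
 * clamped by the destination page instead.  Whatever remains once len
 * drops below LARGE_COPY_CUTOFF, or once a page fails the checks above,
 * is handed to the fallback memcpy_t routine.
 */
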
void *memcpy(void *to, const void *from, __kernel_size_t n)
{
	if (n < LARGE_COPY_CUTOFF)
		return (void *)__memcpy_asm(to, from, n);
	else
		return (void *)fast_copy(to, from, n, __memcpy_asm);
}

unsigned long __copy_to_user_inatomic(void __user *to, const void *from,
				      unsigned long n)
{
	if (n < LARGE_COPY_CUTOFF)
		return __copy_to_user_inatomic_asm(to, from, n);
	else
		return fast_copy(to, from, n, __copy_to_user_inatomic_asm);
}

unsigned long __copy_from_user_inatomic(void *to, const void __user *from,
					unsigned long n)
{
	if (n < LARGE_COPY_CUTOFF)
		return __copy_from_user_inatomic_asm(to, from, n);
	else
		return fast_copy(to, from, n, __copy_from_user_inatomic_asm);
}

unsigned long __copy_from_user_zeroing(void *to, const void __user *from,
				       unsigned long n)
{
	if (n < LARGE_COPY_CUTOFF)
		return __copy_from_user_zeroing_asm(to, from, n);
	else
		return fast_copy(to, from, n, __copy_from_user_zeroing_asm);
}

#endif /* !CHIP_HAS_COHERENT_LOCAL_CACHE() */