memcpy_tile64.c

/*
 * Copyright 2010 Tilera Corporation. All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation, version 2.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
 * NON INFRINGEMENT.  See the GNU General Public License for
 * more details.
 */

#include <linux/string.h>
#include <linux/smp.h>
#include <linux/module.h>
#include <linux/uaccess.h>
#include <asm/fixmap.h>
#include <asm/kmap_types.h>
#include <asm/tlbflush.h>
#include <hv/hypervisor.h>
#include <arch/chip.h>

#if !CHIP_HAS_COHERENT_LOCAL_CACHE()

/* Defined in memcpy.S */
extern unsigned long __memcpy_asm(void *to, const void *from, unsigned long n);
extern unsigned long __copy_to_user_inatomic_asm(
	void __user *to, const void *from, unsigned long n);
extern unsigned long __copy_from_user_inatomic_asm(
	void *to, const void __user *from, unsigned long n);
extern unsigned long __copy_from_user_zeroing_asm(
	void *to, const void __user *from, unsigned long n);

typedef unsigned long (*memcpy_t)(void *, const void *, unsigned long);

/* Size above which to consider TLB games for performance */
#define LARGE_COPY_CUTOFF 2048

/* Communicate to the simulator what we are trying to do. */
#define sim_allow_multiple_caching(b) \
	__insn_mtspr(SPR_SIM_CONTROL, \
		     SIM_CONTROL_ALLOW_MULTIPLE_CACHING | \
		     ((b) << _SIM_CONTROL_OPERATOR_BITS))

/*
 * Copy memory by briefly enabling incoherent cacheline-at-a-time mode.
 *
 * We set up our own source and destination PTEs that we fully control.
 * This is the only way to guarantee that we don't race with another
 * thread that is modifying the PTE; we can't afford to try the
 * copy_{to,from}_user() technique of catching the interrupt, since
 * we must run with interrupts disabled to avoid the risk of some
 * other code seeing the incoherent data in our cache.  (Recall that
 * our cache is indexed by PA, so even if the other code doesn't use
 * our kmap_atomic virtual addresses, they'll still hit in cache using
 * the normal VAs that aren't supposed to hit in cache.)
 */
static void memcpy_multicache(void *dest, const void *source,
			      pte_t dst_pte, pte_t src_pte, int len)
{
	int idx;
	unsigned long flags, newsrc, newdst;
	pmd_t *pmdp;
	pte_t *ptep;
	int type0, type1;
	int cpu = get_cpu();

	/*
	 * Disable interrupts so that we don't recurse into memcpy()
	 * in an interrupt handler, nor accidentally reference
	 * the PA of the source from an interrupt routine.  Also
	 * notify the simulator that we're playing games so we don't
	 * generate spurious coherency warnings.
	 */
	local_irq_save(flags);
	sim_allow_multiple_caching(1);

	/* Set up the new dest mapping */
	type0 = kmap_atomic_idx_push();
	idx = FIX_KMAP_BEGIN + (KM_TYPE_NR * cpu) + type0;
	newdst = __fix_to_virt(idx) + ((unsigned long)dest & (PAGE_SIZE-1));
	pmdp = pmd_offset(pud_offset(pgd_offset_k(newdst), newdst), newdst);
	ptep = pte_offset_kernel(pmdp, newdst);
	if (pte_val(*ptep) != pte_val(dst_pte)) {
		set_pte(ptep, dst_pte);
		local_flush_tlb_page(NULL, newdst, PAGE_SIZE);
	}

	/* Set up the new source mapping */
	type1 = kmap_atomic_idx_push();
	idx += (type0 - type1);
	src_pte = hv_pte_set_nc(src_pte);
	src_pte = hv_pte_clear_writable(src_pte);  /* be paranoid */
	newsrc = __fix_to_virt(idx) + ((unsigned long)source & (PAGE_SIZE-1));
	pmdp = pmd_offset(pud_offset(pgd_offset_k(newsrc), newsrc), newsrc);
	ptep = pte_offset_kernel(pmdp, newsrc);
	__set_pte(ptep, src_pte); /* set_pte() would be confused by this */
	local_flush_tlb_page(NULL, newsrc, PAGE_SIZE);

	/* Actually move the data. */
	__memcpy_asm((void *)newdst, (const void *)newsrc, len);

	/*
	 * Remap the source as locally-cached and not OLOC'ed so that
	 * we can inval without also invaling the remote cpu's cache.
	 * This also avoids known errata with inv'ing cacheable oloc data.
	 */
	src_pte = hv_pte_set_mode(src_pte, HV_PTE_MODE_CACHE_NO_L3);
	src_pte = hv_pte_set_writable(src_pte); /* need write access for inv */
	__set_pte(ptep, src_pte); /* set_pte() would be confused by this */
	local_flush_tlb_page(NULL, newsrc, PAGE_SIZE);

	/*
	 * Do the actual invalidation, covering the full L2 cache line
	 * at the end since __memcpy_asm() is somewhat aggressive.
	 */
	__inv_buffer((void *)newsrc, len);

	/*
	 * We're done: notify the simulator that all is back to normal,
	 * and re-enable interrupts and pre-emption.
	 */
	kmap_atomic_idx_pop();
	kmap_atomic_idx_pop();
	sim_allow_multiple_caching(0);
	local_irq_restore(flags);
	put_cpu();
}

/*
 * Identify large copies from remotely-cached memory, and copy them
 * via memcpy_multicache() if they look good, otherwise fall back
 * to the particular kind of copying passed as the memcpy_t function.
 */
static unsigned long fast_copy(void *dest, const void *source, int len,
			       memcpy_t func)
{
	/*
	 * Check if it's big enough to bother with.  We may end up doing a
	 * small copy via TLB manipulation if we're near a page boundary,
	 * but presumably we'll make it up when we hit the second page.
	 */
	while (len >= LARGE_COPY_CUTOFF) {
		int copy_size, bytes_left_on_page;
		pte_t *src_ptep, *dst_ptep;
		pte_t src_pte, dst_pte;
		struct page *src_page, *dst_page;

		/* Is the source page oloc'ed to a remote cpu? */
retry_source:
		src_ptep = virt_to_pte(current->mm, (unsigned long)source);
		if (src_ptep == NULL)
			break;
		src_pte = *src_ptep;
		if (!hv_pte_get_present(src_pte) ||
		    !hv_pte_get_readable(src_pte) ||
		    hv_pte_get_mode(src_pte) != HV_PTE_MODE_CACHE_TILE_L3)
			break;
		if (get_remote_cache_cpu(src_pte) == smp_processor_id())
			break;
		src_page = pfn_to_page(hv_pte_get_pfn(src_pte));
		get_page(src_page);
		if (pte_val(src_pte) != pte_val(*src_ptep)) {
			put_page(src_page);
			goto retry_source;
		}
		if (pte_huge(src_pte)) {
			/* Adjust the PTE to correspond to a small page */
			int pfn = hv_pte_get_pfn(src_pte);
			pfn += (((unsigned long)source & (HPAGE_SIZE-1))
				>> PAGE_SHIFT);
			src_pte = pfn_pte(pfn, src_pte);
			src_pte = pte_mksmall(src_pte);
		}

		/* Is the destination page writable? */
retry_dest:
		dst_ptep = virt_to_pte(current->mm, (unsigned long)dest);
		if (dst_ptep == NULL) {
			put_page(src_page);
			break;
		}
		dst_pte = *dst_ptep;
		if (!hv_pte_get_present(dst_pte) ||
		    !hv_pte_get_writable(dst_pte)) {
			put_page(src_page);
			break;
		}
		dst_page = pfn_to_page(hv_pte_get_pfn(dst_pte));
		if (dst_page == src_page) {
			/*
			 * Source and dest are on the same page; this
			 * potentially exposes us to incoherence if any
			 * part of src and dest overlap on a cache line.
			 * Just give up rather than trying to be precise.
			 */
			put_page(src_page);
			break;
		}
		get_page(dst_page);
		if (pte_val(dst_pte) != pte_val(*dst_ptep)) {
			put_page(dst_page);
			goto retry_dest;
		}
		if (pte_huge(dst_pte)) {
			/* Adjust the PTE to correspond to a small page */
			int pfn = hv_pte_get_pfn(dst_pte);
			pfn += (((unsigned long)dest & (HPAGE_SIZE-1))
				>> PAGE_SHIFT);
			dst_pte = pfn_pte(pfn, dst_pte);
			dst_pte = pte_mksmall(dst_pte);
		}

		/* All looks good: create a cachable PTE and copy from it */
		copy_size = len;
		bytes_left_on_page =
			PAGE_SIZE - (((int)source) & (PAGE_SIZE-1));
		if (copy_size > bytes_left_on_page)
			copy_size = bytes_left_on_page;
		bytes_left_on_page =
			PAGE_SIZE - (((int)dest) & (PAGE_SIZE-1));
		if (copy_size > bytes_left_on_page)
			copy_size = bytes_left_on_page;
		memcpy_multicache(dest, source, dst_pte, src_pte, copy_size);

		/* Release the pages */
		put_page(dst_page);
		put_page(src_page);

		/* Continue on the next page */
		dest += copy_size;
		source += copy_size;
		len -= copy_size;
	}

	return func(dest, source, len);
}
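
/*
 * Entry points.  Short copies go straight to the assembly routines;
 * copies of at least LARGE_COPY_CUTOFF bytes go through fast_copy(),
 * which may use the multi-caching trick above for remotely-cached pages.
 */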
void *memcpy(void *to, const void *from, __kernel_size_t n)
{
	if (n < LARGE_COPY_CUTOFF)
		return (void *)__memcpy_asm(to, from, n);
	else
		return (void *)fast_copy(to, from, n, __memcpy_asm);
}

unsigned long __copy_to_user_inatomic(void __user *to, const void *from,
				      unsigned long n)
{
	if (n < LARGE_COPY_CUTOFF)
		return __copy_to_user_inatomic_asm(to, from, n);
	else
		return fast_copy(to, from, n, __copy_to_user_inatomic_asm);
}

unsigned long __copy_from_user_inatomic(void *to, const void __user *from,
					unsigned long n)
{
	if (n < LARGE_COPY_CUTOFF)
		return __copy_from_user_inatomic_asm(to, from, n);
	else
		return fast_copy(to, from, n, __copy_from_user_inatomic_asm);
}

unsigned long __copy_from_user_zeroing(void *to, const void __user *from,
				       unsigned long n)
{
	if (n < LARGE_COPY_CUTOFF)
		return __copy_from_user_zeroing_asm(to, from, n);
	else
		return fast_copy(to, from, n, __copy_from_user_zeroing_asm);
}

#endif /* !CHIP_HAS_COHERENT_LOCAL_CACHE() */