memcpy.c

/*
 * Optimized memory copy routines.
 *
 * Copyright (C) 2004 Randolph Chung <tausq@debian.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 * Portions derived from the GNU C Library
 * Copyright (C) 1991, 1997, 2003 Free Software Foundation, Inc.
 *
 * Several strategies are tried to get the best performance under various
 * conditions. In the optimal case, we copy 64 bytes in an unrolled loop using
 * fp regs. This is followed by loops that copy 32 or 16 bytes at a time using
 * general registers. Unaligned copies are handled either by aligning the
 * destination and then using a shift-and-write method, or in a few cases by
 * falling back to a byte-at-a-time copy.
 *
 * I chose to implement this in C because it is easier to maintain and debug,
 * and in my experiments the C code generated by gcc (3.3/3.4 at the time of
 * writing) is close to optimal. Unfortunately some of the semantics of the
 * copy routine (exception handling) are difficult to express in C, so we have
 * to play some tricks to get it to work.
 *
 * All the loads and stores are done via explicit asm() code in order to use
 * the right space registers.
 *
 * Testing with various alignments and buffer sizes shows that this code is
 * often >10x faster than a simple byte-at-a-time copy, even for oddly
 * aligned operands. It is interesting to note that the glibc version of
 * memcpy (written in C) is already quite fast. This routine beats it by
 * 30-40% for aligned copies because of the loop unrolling, but in some cases
 * the glibc version is still slightly faster. This lends more credibility to
 * the claim that gcc can generate very good code as long as we are careful.
 *
 * TODO:
 * - cache prefetching needs more experimentation to get optimal settings
 * - try not to use the post-increment address modifiers; they create
 *   additional interlocks
 * - replace byte-copy loops with stbys sequences
 */
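
/*
 * A note on the fault-handling trick mentioned above (my reading of the code
 * below, stated informally): every load and store is emitted as a numbered
 * asm statement with an ASM_EXCEPTIONTABLE_ENTRY() pointing at a local label
 * such as cda_ldw_exc or pmc_load_exc. If the access faults, the fault
 * handler looks up that entry, records the faulting address in the per-CPU
 * exception_data, and resumes execution at the label, where the routine
 * computes how many bytes were left uncopied. preserve_branch() exists only
 * to keep gcc from discarding those "unreachable" fixup blocks.
 */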

#ifdef __KERNEL__
#include <linux/module.h>
#include <linux/compiler.h>
#include <asm/uaccess.h>
#define s_space "%%sr1"
#define d_space "%%sr2"
#else
#include "memcpy.h"
#define s_space "%%sr0"
#define d_space "%%sr0"
#define pa_memcpy new2_copy
#endif

DECLARE_PER_CPU(struct exception_data, exception_data);

#define preserve_branch(label)  do {                                    \
        volatile int dummy;                                             \
        /* The following branch is never taken, it's just here to */    \
        /* prevent gcc from optimizing away our exception code. */      \
        if (unlikely(dummy != dummy))                                   \
                goto label;                                             \
} while (0)

#define get_user_space() (segment_eq(get_fs(), KERNEL_DS) ? 0 : mfsp(3))
#define get_kernel_space() (0)

#define MERGE(w0, sh_1, w1, sh_2) ({                    \
        unsigned int _r;                                \
        asm volatile (                                  \
                "mtsar %3\n"                            \
                "shrpw %1, %2, %%sar, %0\n"             \
                : "=r"(_r)                              \
                : "r"(w0), "r"(w1), "r"(sh_2)           \
        );                                              \
        _r;                                             \
})
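
/*
 * For readers unfamiliar with shrpw: it shifts the 64-bit concatenation w0:w1
 * right by %sar bits and returns the low word, so with sh_1 + sh_2 == 32 the
 * macro above should be equivalent to the portable big-endian expression
 *
 *      (w0 << sh_1) | (w1 >> sh_2)
 *
 * e.g. for a source misaligned by one byte (sh_1 = 8, sh_2 = 24), merging the
 * aligned words [a0 a1 a2 a3] and [b0 b1 b2 b3] yields [a1 a2 a3 b0], i.e.
 * the four source bytes starting at the misaligned address.
 */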

#define THRESHOLD       16

#ifdef DEBUG_MEMCPY
#define DPRINTF(fmt, args...) do { printk(KERN_DEBUG "%s:%d:%s ", __FILE__, __LINE__, __func__ ); printk(KERN_DEBUG fmt, ##args ); } while (0)
#else
#define DPRINTF(fmt, args...)
#endif

#define def_load_ai_insn(_insn,_sz,_tt,_s,_a,_t,_e)     \
        __asm__ __volatile__ (                          \
        "1:\t" #_insn ",ma " #_sz "(" _s ",%1), %0\n\t" \
        ASM_EXCEPTIONTABLE_ENTRY(1b,_e)                 \
        : _tt(_t), "+r"(_a)                             \
        :                                               \
        : "r8")

#define def_store_ai_insn(_insn,_sz,_tt,_s,_a,_t,_e)    \
        __asm__ __volatile__ (                          \
        "1:\t" #_insn ",ma %1, " #_sz "(" _s ",%0)\n\t" \
        ASM_EXCEPTIONTABLE_ENTRY(1b,_e)                 \
        : "+r"(_a)                                      \
        : _tt(_t)                                       \
        : "r8")

#define ldbma(_s, _a, _t, _e) def_load_ai_insn(ldbs,1,"=r",_s,_a,_t,_e)
#define stbma(_s, _t, _a, _e) def_store_ai_insn(stbs,1,"r",_s,_a,_t,_e)
#define ldwma(_s, _a, _t, _e) def_load_ai_insn(ldw,4,"=r",_s,_a,_t,_e)
#define stwma(_s, _t, _a, _e) def_store_ai_insn(stw,4,"r",_s,_a,_t,_e)
#define flddma(_s, _a, _t, _e) def_load_ai_insn(fldd,8,"=f",_s,_a,_t,_e)
#define fstdma(_s, _t, _a, _e) def_store_ai_insn(fstd,8,"f",_s,_a,_t,_e)

#define def_load_insn(_insn,_tt,_s,_o,_a,_t,_e)         \
        __asm__ __volatile__ (                          \
        "1:\t" #_insn " " #_o "(" _s ",%1), %0\n\t"     \
        ASM_EXCEPTIONTABLE_ENTRY(1b,_e)                 \
        : _tt(_t)                                       \
        : "r"(_a)                                       \
        : "r8")

#define def_store_insn(_insn,_tt,_s,_t,_o,_a,_e)        \
        __asm__ __volatile__ (                          \
        "1:\t" #_insn " %0, " #_o "(" _s ",%1)\n\t"     \
        ASM_EXCEPTIONTABLE_ENTRY(1b,_e)                 \
        :                                               \
        : _tt(_t), "r"(_a)                              \
        : "r8")

#define ldw(_s,_o,_a,_t,_e)     def_load_insn(ldw,"=r",_s,_o,_a,_t,_e)
#define stw(_s,_t,_o,_a,_e)     def_store_insn(stw,"r",_s,_t,_o,_a,_e)

#ifdef CONFIG_PREFETCH
static inline void prefetch_src(const void *addr)
{
        __asm__("ldw 0(" s_space ",%0), %%r0" : : "r" (addr));
}

static inline void prefetch_dst(const void *addr)
{
        __asm__("ldd 0(" d_space ",%0), %%r0" : : "r" (addr));
}
#else
#define prefetch_src(addr) do { } while(0)
#define prefetch_dst(addr) do { } while(0)
#endif

/* Copy from a not-aligned src to an aligned dst, using shifts. Handles 4 words
 * per loop. This code is derived from glibc.
 */
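
/*
 * Purely for illustration, a portable C sketch of the same shift-and-merge
 * idea on a big-endian machine (simple loop, no unrolling, no fault handling).
 * The function name and shape below are mine, not part of this file's API;
 * the real, exception-aware implementation follows.
 */
#if 0
static void shift_copy_sketch(unsigned int *dst, unsigned long src,
                              unsigned long nwords)
{
        /* Assumes src is NOT word-aligned (otherwise sh_2 would be 32). */
        int sh_1 = 8 * (src % sizeof(unsigned int));    /* bits to drop from the first word */
        int sh_2 = 8 * sizeof(unsigned int) - sh_1;     /* bits to take from the next word  */
        unsigned int *asrc = (unsigned int *)(src & ~(sizeof(unsigned int) - 1));
        unsigned int w0 = *asrc++;                      /* word containing the first bytes  */

        while (nwords--) {
                unsigned int w1 = *asrc++;
                *dst++ = (w0 << sh_1) | (w1 >> sh_2);   /* same merge as MERGE()/shrpw */
                w0 = w1;
        }
}
#endif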

static inline unsigned long copy_dstaligned(unsigned long dst, unsigned long src,
                                        unsigned long len, unsigned long o_dst,
                                        unsigned long o_src, unsigned long o_len)
{
        /* gcc complains that a2 and a3 may be uninitialized, but actually
         * they cannot be. Initialize a2/a3 to shut gcc up.
         */
        register unsigned int a0, a1, a2 = 0, a3 = 0;
        int sh_1, sh_2;
        struct exception_data *d;

        /* prefetch_src((const void *)src); */

        /* Calculate how to shift a word read at the memory operation
           aligned srcp to make it aligned for copy. */
        sh_1 = 8 * (src % sizeof(unsigned int));
        sh_2 = 8 * sizeof(unsigned int) - sh_1;

        /* Make src aligned by rounding it down. */
        src &= -sizeof(unsigned int);

        switch (len % 4)
        {
        case 2:
                /* a1 = ((unsigned int *) src)[0];
                   a2 = ((unsigned int *) src)[1]; */
                ldw(s_space, 0, src, a1, cda_ldw_exc);
                ldw(s_space, 4, src, a2, cda_ldw_exc);
                src -= 1 * sizeof(unsigned int);
                dst -= 3 * sizeof(unsigned int);
                len += 2;
                goto do1;
        case 3:
                /* a0 = ((unsigned int *) src)[0];
                   a1 = ((unsigned int *) src)[1]; */
                ldw(s_space, 0, src, a0, cda_ldw_exc);
                ldw(s_space, 4, src, a1, cda_ldw_exc);
                src -= 0 * sizeof(unsigned int);
                dst -= 2 * sizeof(unsigned int);
                len += 1;
                goto do2;
        case 0:
                if (len == 0)
                        return 0;
                /* a3 = ((unsigned int *) src)[0];
                   a0 = ((unsigned int *) src)[1]; */
                ldw(s_space, 0, src, a3, cda_ldw_exc);
                ldw(s_space, 4, src, a0, cda_ldw_exc);
                src -= -1 * sizeof(unsigned int);
                dst -=  1 * sizeof(unsigned int);
                len += 0;
                goto do3;
        case 1:
                /* a2 = ((unsigned int *) src)[0];
                   a3 = ((unsigned int *) src)[1]; */
                ldw(s_space, 0, src, a2, cda_ldw_exc);
                ldw(s_space, 4, src, a3, cda_ldw_exc);
                src -= -2 * sizeof(unsigned int);
                dst -=  0 * sizeof(unsigned int);
                len -= 1;
                if (len == 0)
                        goto do0;
                goto do4;                       /* No-op. */
        }

        do
        {
                /* prefetch_src((const void *)(src + 4 * sizeof(unsigned int))); */
do4:
                /* a0 = ((unsigned int *) src)[0]; */
                ldw(s_space, 0, src, a0, cda_ldw_exc);
                /* ((unsigned int *) dst)[0] = MERGE (a2, sh_1, a3, sh_2); */
                stw(d_space, MERGE (a2, sh_1, a3, sh_2), 0, dst, cda_stw_exc);
do3:
                /* a1 = ((unsigned int *) src)[1]; */
                ldw(s_space, 4, src, a1, cda_ldw_exc);
                /* ((unsigned int *) dst)[1] = MERGE (a3, sh_1, a0, sh_2); */
                stw(d_space, MERGE (a3, sh_1, a0, sh_2), 4, dst, cda_stw_exc);
do2:
                /* a2 = ((unsigned int *) src)[2]; */
                ldw(s_space, 8, src, a2, cda_ldw_exc);
                /* ((unsigned int *) dst)[2] = MERGE (a0, sh_1, a1, sh_2); */
                stw(d_space, MERGE (a0, sh_1, a1, sh_2), 8, dst, cda_stw_exc);
do1:
                /* a3 = ((unsigned int *) src)[3]; */
                ldw(s_space, 12, src, a3, cda_ldw_exc);
                /* ((unsigned int *) dst)[3] = MERGE (a1, sh_1, a2, sh_2); */
                stw(d_space, MERGE (a1, sh_1, a2, sh_2), 12, dst, cda_stw_exc);

                src += 4 * sizeof(unsigned int);
                dst += 4 * sizeof(unsigned int);
                len -= 4;
        }
        while (len != 0);

do0:
        /* ((unsigned int *) dst)[0] = MERGE (a2, sh_1, a3, sh_2); */
        stw(d_space, MERGE (a2, sh_1, a3, sh_2), 0, dst, cda_stw_exc);

        preserve_branch(handle_load_error);
        preserve_branch(handle_store_error);

        return 0;

handle_load_error:
        __asm__ __volatile__ ("cda_ldw_exc:\n");
        d = &__get_cpu_var(exception_data);
        DPRINTF("cda_ldw_exc: o_len=%lu fault_addr=%lu o_src=%lu ret=%lu\n",
                o_len, d->fault_addr, o_src, o_len - d->fault_addr + o_src);
        return o_len * 4 - d->fault_addr + o_src;

handle_store_error:
        __asm__ __volatile__ ("cda_stw_exc:\n");
        d = &__get_cpu_var(exception_data);
        DPRINTF("cda_stw_exc: o_len=%lu fault_addr=%lu o_dst=%lu ret=%lu\n",
                o_len, d->fault_addr, o_dst, o_len - d->fault_addr + o_dst);
        return o_len * 4 - d->fault_addr + o_dst;
}

/* Returns 0 for success; otherwise returns the number of bytes not transferred. */
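/* Rough flow, as I read the code below: copies shorter than THRESHOLD go
 * straight to the byte loop. If src and dst are mutually doubleword-aligned,
 * we byte-copy up to the alignment boundary and then run the unrolled
 * 8-word/4-word loops (the fp-register doubleword loop is currently #if 0'd
 * out). If they are only mutually word-aligned, we do the same with the word
 * loops. Otherwise we word-align the destination and let copy_dstaligned()
 * do a shift-and-merge copy, finishing any remainder a byte at a time.
 */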
static unsigned long pa_memcpy(void *dstp, const void *srcp, unsigned long len)
{
        register unsigned long src, dst, t1, t2, t3;
        register unsigned char *pcs, *pcd;
        register unsigned int *pws, *pwd;
        register double *pds, *pdd;
        unsigned long ret = 0;
        unsigned long o_dst, o_src, o_len;
        struct exception_data *d;

        src = (unsigned long)srcp;
        dst = (unsigned long)dstp;
        pcs = (unsigned char *)srcp;
        pcd = (unsigned char *)dstp;

        o_dst = dst; o_src = src; o_len = len;

        /* prefetch_src((const void *)srcp); */

        if (len < THRESHOLD)
                goto byte_copy;

        /* Check alignment */
        t1 = (src ^ dst);
        if (unlikely(t1 & (sizeof(double)-1)))
                goto unaligned_copy;

        /* src and dst have same alignment. */

        /* Copy bytes till we are double-aligned. */
        t2 = src & (sizeof(double) - 1);
        if (unlikely(t2 != 0)) {
                t2 = sizeof(double) - t2;
                while (t2 && len) {
                        /* *pcd++ = *pcs++; */
                        ldbma(s_space, pcs, t3, pmc_load_exc);
                        len--;
                        stbma(d_space, t3, pcd, pmc_store_exc);
                        t2--;
                }
        }

        pds = (double *)pcs;
        pdd = (double *)pcd;

#if 0
        /* Copy 8 doubles at a time */
        while (len >= 8*sizeof(double)) {
                register double r1, r2, r3, r4, r5, r6, r7, r8;
                /* prefetch_src((char *)pds + L1_CACHE_BYTES); */
                flddma(s_space, pds, r1, pmc_load_exc);
                flddma(s_space, pds, r2, pmc_load_exc);
                flddma(s_space, pds, r3, pmc_load_exc);
                flddma(s_space, pds, r4, pmc_load_exc);
                fstdma(d_space, r1, pdd, pmc_store_exc);
                fstdma(d_space, r2, pdd, pmc_store_exc);
                fstdma(d_space, r3, pdd, pmc_store_exc);
                fstdma(d_space, r4, pdd, pmc_store_exc);
#if 0
                if (L1_CACHE_BYTES <= 32)
                        prefetch_src((char *)pds + L1_CACHE_BYTES);
#endif
                flddma(s_space, pds, r5, pmc_load_exc);
                flddma(s_space, pds, r6, pmc_load_exc);
                flddma(s_space, pds, r7, pmc_load_exc);
                flddma(s_space, pds, r8, pmc_load_exc);
                fstdma(d_space, r5, pdd, pmc_store_exc);
                fstdma(d_space, r6, pdd, pmc_store_exc);
                fstdma(d_space, r7, pdd, pmc_store_exc);
                fstdma(d_space, r8, pdd, pmc_store_exc);
                len -= 8*sizeof(double);
        }
#endif

        pws = (unsigned int *)pds;
        pwd = (unsigned int *)pdd;

word_copy:
        while (len >= 8*sizeof(unsigned int)) {
                register unsigned int r1,r2,r3,r4,r5,r6,r7,r8;
                /* prefetch_src((char *)pws + L1_CACHE_BYTES); */
                ldwma(s_space, pws, r1, pmc_load_exc);
                ldwma(s_space, pws, r2, pmc_load_exc);
                ldwma(s_space, pws, r3, pmc_load_exc);
                ldwma(s_space, pws, r4, pmc_load_exc);
                stwma(d_space, r1, pwd, pmc_store_exc);
                stwma(d_space, r2, pwd, pmc_store_exc);
                stwma(d_space, r3, pwd, pmc_store_exc);
                stwma(d_space, r4, pwd, pmc_store_exc);

                ldwma(s_space, pws, r5, pmc_load_exc);
                ldwma(s_space, pws, r6, pmc_load_exc);
                ldwma(s_space, pws, r7, pmc_load_exc);
                ldwma(s_space, pws, r8, pmc_load_exc);
                stwma(d_space, r5, pwd, pmc_store_exc);
                stwma(d_space, r6, pwd, pmc_store_exc);
                stwma(d_space, r7, pwd, pmc_store_exc);
                stwma(d_space, r8, pwd, pmc_store_exc);
                len -= 8*sizeof(unsigned int);
        }

        while (len >= 4*sizeof(unsigned int)) {
                register unsigned int r1,r2,r3,r4;
                ldwma(s_space, pws, r1, pmc_load_exc);
                ldwma(s_space, pws, r2, pmc_load_exc);
                ldwma(s_space, pws, r3, pmc_load_exc);
                ldwma(s_space, pws, r4, pmc_load_exc);
                stwma(d_space, r1, pwd, pmc_store_exc);
                stwma(d_space, r2, pwd, pmc_store_exc);
                stwma(d_space, r3, pwd, pmc_store_exc);
                stwma(d_space, r4, pwd, pmc_store_exc);
                len -= 4*sizeof(unsigned int);
        }

        pcs = (unsigned char *)pws;
        pcd = (unsigned char *)pwd;

byte_copy:
        while (len) {
                /* *pcd++ = *pcs++; */
                ldbma(s_space, pcs, t3, pmc_load_exc);
                stbma(d_space, t3, pcd, pmc_store_exc);
                len--;
        }

        return 0;

unaligned_copy:
        /* possibly we are aligned on a word, but not on a double... */
        if (likely((t1 & (sizeof(unsigned int)-1)) == 0)) {
                t2 = src & (sizeof(unsigned int) - 1);

                if (unlikely(t2 != 0)) {
                        t2 = sizeof(unsigned int) - t2;
                        while (t2) {
                                /* *pcd++ = *pcs++; */
                                ldbma(s_space, pcs, t3, pmc_load_exc);
                                stbma(d_space, t3, pcd, pmc_store_exc);
                                len--;
                                t2--;
                        }
                }

                pws = (unsigned int *)pcs;
                pwd = (unsigned int *)pcd;
                goto word_copy;
        }

        /* Align the destination. */
        if (unlikely((dst & (sizeof(unsigned int) - 1)) != 0)) {
                t2 = sizeof(unsigned int) - (dst & (sizeof(unsigned int) - 1));
                while (t2) {
                        /* *pcd++ = *pcs++; */
                        ldbma(s_space, pcs, t3, pmc_load_exc);
                        stbma(d_space, t3, pcd, pmc_store_exc);
                        len--;
                        t2--;
                }
                dst = (unsigned long)pcd;
                src = (unsigned long)pcs;
        }

        ret = copy_dstaligned(dst, src, len / sizeof(unsigned int),
                o_dst, o_src, o_len);
        if (ret)
                return ret;

        pcs += (len & -sizeof(unsigned int));
        pcd += (len & -sizeof(unsigned int));
        len %= sizeof(unsigned int);

        preserve_branch(handle_load_error);
        preserve_branch(handle_store_error);

        goto byte_copy;

handle_load_error:
        __asm__ __volatile__ ("pmc_load_exc:\n");
        d = &__get_cpu_var(exception_data);
        DPRINTF("pmc_load_exc: o_len=%lu fault_addr=%lu o_src=%lu ret=%lu\n",
                o_len, d->fault_addr, o_src, o_len - d->fault_addr + o_src);
        return o_len - d->fault_addr + o_src;

handle_store_error:
        __asm__ __volatile__ ("pmc_store_exc:\n");
        d = &__get_cpu_var(exception_data);
        DPRINTF("pmc_store_exc: o_len=%lu fault_addr=%lu o_dst=%lu ret=%lu\n",
                o_len, d->fault_addr, o_dst, o_len - d->fault_addr + o_dst);
        return o_len - d->fault_addr + o_dst;
}

#ifdef __KERNEL__
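/*
 * The entry points below differ only in how they set up the space registers:
 * pa_memcpy() reads through %sr1 and writes through %sr2 (s_space/d_space
 * above), so loading either the user space id from %sr3 or the kernel space
 * (0) into those registers is what turns the one copy loop into copy_to_user,
 * __copy_from_user, copy_in_user or a plain memcpy.
 */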
unsigned long copy_to_user(void __user *dst, const void *src, unsigned long len)
{
        mtsp(get_kernel_space(), 1);
        mtsp(get_user_space(), 2);
        return pa_memcpy((void __force *)dst, src, len);
}

unsigned long __copy_from_user(void *dst, const void __user *src, unsigned long len)
{
        mtsp(get_user_space(), 1);
        mtsp(get_kernel_space(), 2);
        return pa_memcpy(dst, (void __force *)src, len);
}
EXPORT_SYMBOL(__copy_from_user);

unsigned long copy_in_user(void __user *dst, const void __user *src, unsigned long len)
{
        mtsp(get_user_space(), 1);
        mtsp(get_user_space(), 2);
        return pa_memcpy((void __force *)dst, (void __force *)src, len);
}

void *memcpy(void *dst, const void *src, size_t count)
{
        mtsp(get_kernel_space(), 1);
        mtsp(get_kernel_space(), 2);
        pa_memcpy(dst, src, count);
        return dst;
}

EXPORT_SYMBOL(copy_to_user);
EXPORT_SYMBOL(copy_from_user);
EXPORT_SYMBOL(copy_in_user);
EXPORT_SYMBOL(memcpy);
#endif