/* Copyright 2002 Andi Kleen */

#include <linux/linkage.h>
#include <asm/errno.h>
#include <asm/cpufeatures.h>
#include <asm/alternative-asm.h>
#include <asm/export.h>

/*
 * We build a jump to memcpy_orig by default, which gets NOPped out on
 * the majority of x86 CPUs, which set REP_GOOD. In addition, CPUs which
 * have the enhanced REP MOVSB/STOSB feature (ERMS) change those NOPs
 * to a jmp to memcpy_erms, which does the copy with a single REP MOVSB.
 */
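
/*
 * In rough pseudocode, the dispatch patched in by the ALTERNATIVE_2 below is:
 *
 *      if (cpu has X86_FEATURE_ERMS)          use memcpy_erms (plain REP MOVSB)
 *      else if (cpu has X86_FEATURE_REP_GOOD) use the inline REP MOVSQ + REP MOVSB body
 *      else                                   use memcpy_orig (unrolled 8-byte moves)
 *
 * (Illustrative sketch only; the actual patching is done at boot by the
 * alternatives machinery.)
 */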

.weak memcpy

/*
 * memcpy - Copy a memory block.
 *
 * Input:
 *  rdi destination
 *  rsi source
 *  rdx count
 *
 * Output:
 *  rax original destination
 */
ENTRY(__memcpy)
ENTRY(memcpy)
        ALTERNATIVE_2 "jmp memcpy_orig", "", X86_FEATURE_REP_GOOD, \
                      "jmp memcpy_erms", X86_FEATURE_ERMS
        movq %rdi, %rax
        movq %rdx, %rcx
        shrq $3, %rcx
        andl $7, %edx
        rep movsq
        movl %edx, %ecx
        rep movsb
        ret
ENDPROC(memcpy)
ENDPROC(__memcpy)
EXPORT_SYMBOL(memcpy)
EXPORT_SYMBOL(__memcpy)

/*
 * memcpy_erms() - enhanced fast string memcpy. This is faster and
 * simpler than memcpy. Use memcpy_erms when possible.
 */
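/*
 * With ERMS the microcoded REP MOVSB is expected to handle short and
 * unaligned copies efficiently, so no manual unrolling or alignment
 * fixup is done here.
 */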
ENTRY(memcpy_erms)
        movq %rdi, %rax
        movq %rdx, %rcx
        rep movsb
        ret
ENDPROC(memcpy_erms)

ENTRY(memcpy_orig)
        movq %rdi, %rax

        cmpq $0x20, %rdx
        jb .Lhandle_tail

        /*
         * Check whether a memory false dependence could occur between the
         * loads and stores, then jump to the corresponding copy mode.
         */
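        /*
         * Heuristic (comparing only the low address bytes of source and
         * destination): when the source's low byte is below the
         * destination's, a forward copy could keep re-loading data that a
         * nearby earlier store just wrote (a false store-to-load
         * dependence), so copy backwards instead.
         */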
        cmp %dil, %sil
        jl .Lcopy_backward
        subq $0x20, %rdx
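        /*
         * rdx is biased down by 0x20 before entering the loop: the subq at
         * the top of each iteration sets the flags, the movq/leaq that
         * follow do not touch them, and jae loops while no borrow occurred,
         * i.e. while at least another 32 bytes remain. The addl $0x20 after
         * the loop undoes the bias to recover the tail byte count.
         */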
.Lcopy_forward_loop:
        subq $0x20, %rdx

        /*
         * Move in blocks of 4x8 bytes:
         */
        movq 0*8(%rsi), %r8
        movq 1*8(%rsi), %r9
        movq 2*8(%rsi), %r10
        movq 3*8(%rsi), %r11
        leaq 4*8(%rsi), %rsi

        movq %r8, 0*8(%rdi)
        movq %r9, 1*8(%rdi)
        movq %r10, 2*8(%rdi)
        movq %r11, 3*8(%rdi)
        leaq 4*8(%rdi), %rdi
        jae .Lcopy_forward_loop
        addl $0x20, %edx
        jmp .Lhandle_tail

.Lcopy_backward:
        /*
         * Calculate copy position to tail.
         */
        addq %rdx, %rsi
        addq %rdx, %rdi
        subq $0x20, %rdx
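        /*
         * rsi/rdi now point just past the end of each buffer; the loop
         * below copies 32-byte blocks downwards through negative offsets,
         * using the same rdx bias trick as the forward loop.
         */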
        /*
         * At most 3 ALU operations complete per cycle, so pad with NOPs to
         * keep the loop entry within its own 16-byte chunk (the .p2align
         * below does the padding).
         */
        .p2align 4
.Lcopy_backward_loop:
        subq $0x20, %rdx
        movq -1*8(%rsi), %r8
        movq -2*8(%rsi), %r9
        movq -3*8(%rsi), %r10
        movq -4*8(%rsi), %r11
        leaq -4*8(%rsi), %rsi
        movq %r8, -1*8(%rdi)
        movq %r9, -2*8(%rdi)
        movq %r10, -3*8(%rdi)
        movq %r11, -4*8(%rdi)
        leaq -4*8(%rdi), %rdi
        jae .Lcopy_backward_loop
        /*
         * Calculate copy position to head.
         */
        addl $0x20, %edx
        subq %rdx, %rsi
        subq %rdx, %rdi
.Lhandle_tail:
        cmpl $16, %edx
        jb .Lless_16bytes

        /*
         * Move data from 16 bytes to 31 bytes.
         */
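        /*
         * Two qwords are copied from the head and two from the tail; for
         * 16 <= len <= 31 the two pairs overlap so every byte is covered.
         * E.g. len = 20: the head pair covers bytes 0-15 and the tail pair
         * covers bytes 4-19.
         */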
        movq 0*8(%rsi), %r8
        movq 1*8(%rsi), %r9
        movq -2*8(%rsi, %rdx), %r10
        movq -1*8(%rsi, %rdx), %r11
        movq %r8, 0*8(%rdi)
        movq %r9, 1*8(%rdi)
        movq %r10, -2*8(%rdi, %rdx)
        movq %r11, -1*8(%rdi, %rdx)
        retq
        .p2align 4
.Lless_16bytes:
        cmpl $8, %edx
        jb .Lless_8bytes
        /*
         * Move data from 8 bytes to 15 bytes: one qword from the head and
         * one from the tail, overlapping when the length is below 16.
         */
        movq 0*8(%rsi), %r8
        movq -1*8(%rsi, %rdx), %r9
        movq %r8, 0*8(%rdi)
        movq %r9, -1*8(%rdi, %rdx)
        retq
        .p2align 4
.Lless_8bytes:
        cmpl $4, %edx
        jb .Lless_3bytes
        /*
         * Move data from 4 bytes to 7 bytes.
         */
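        /*
         * Same overlap trick with two dwords, e.g. len = 6: the first move
         * covers bytes 0-3 and the second covers bytes 2-5.
         */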
        movl (%rsi), %ecx
        movl -4(%rsi, %rdx), %r8d
        movl %ecx, (%rdi)
        movl %r8d, -4(%rdi, %rdx)
        retq
        .p2align 4
.Lless_3bytes:
        subl $1, %edx
        jb .Lend
        /*
         * Move data from 1 byte to 3 bytes.
         */
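        /*
         * The subl $1 above set the flags: carry means the length was 0
         * (nothing to copy), zero means it was exactly 1 byte. The movzbl
         * below does not change the flags, so the jz still tests the subl's
         * result. For 2 or 3 bytes the first, second and last bytes are
         * written; for length 2 the "second" and "last" byte coincide,
         * which is harmless.
         */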
        movzbl (%rsi), %ecx
        jz .Lstore_1byte
        movzbq 1(%rsi), %r8
        movzbq (%rsi, %rdx), %r9
        movb %r8b, 1(%rdi)
        movb %r9b, (%rdi, %rdx)
.Lstore_1byte:
        movb %cl, (%rdi)
.Lend:
        retq
ENDPROC(memcpy_orig)

#ifndef CONFIG_UML
/*
 * memcpy_mcsafe_unrolled - memory copy with machine check exception handling
 * Note that we only catch machine checks when reading the source addresses.
 * Writes to target are posted and don't generate machine checks.
 */
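/*
 * Returns 0 on success, or -EFAULT if a machine check fault is taken on
 * one of the source reads (see the .fixup section and the
 * _ASM_EXTABLE_FAULT entries at the end of this file).
 */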
ENTRY(memcpy_mcsafe_unrolled)
        cmpl $8, %edx
        /* Less than 8 bytes? Go to byte copy loop */
        jb .L_no_whole_words

        /* Check for bad alignment of source */
        testl $7, %esi
        /* Already aligned */
        jz .L_8byte_aligned
        /* Copy one byte at a time until source is 8-byte aligned */
        movl %esi, %ecx
        andl $7, %ecx
        subl $8, %ecx
        negl %ecx
        subl %ecx, %edx
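        /*
         * ecx now holds 8 - (src & 7), the number of leading bytes needed
         * to reach 8-byte source alignment, and edx has been reduced
         * accordingly. E.g. for a source address ending in ...5:
         * ecx = 8 - 5 = 3 leading bytes.
         */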
.L_copy_leading_bytes:
        movb (%rsi), %al
        movb %al, (%rdi)
        incq %rsi
        incq %rdi
        decl %ecx
        jnz .L_copy_leading_bytes

.L_8byte_aligned:
        /* Figure out how many whole cache lines (64-bytes) to copy */
        movl %edx, %ecx
        andl $63, %edx
        shrl $6, %ecx
        jz .L_no_whole_cache_lines

        /* Loop copying whole cache lines */
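        /*
         * Each source load carries its own .L_cache_w* label so that the
         * exception table entries at the bottom of this file can map a
         * machine check on any individual load to the -EFAULT fixup.
         */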
.L_cache_w0: movq (%rsi), %r8
.L_cache_w1: movq 1*8(%rsi), %r9
.L_cache_w2: movq 2*8(%rsi), %r10
.L_cache_w3: movq 3*8(%rsi), %r11
        movq %r8, (%rdi)
        movq %r9, 1*8(%rdi)
        movq %r10, 2*8(%rdi)
        movq %r11, 3*8(%rdi)
.L_cache_w4: movq 4*8(%rsi), %r8
.L_cache_w5: movq 5*8(%rsi), %r9
.L_cache_w6: movq 6*8(%rsi), %r10
.L_cache_w7: movq 7*8(%rsi), %r11
        movq %r8, 4*8(%rdi)
        movq %r9, 5*8(%rdi)
        movq %r10, 6*8(%rdi)
        movq %r11, 7*8(%rdi)
        leaq 64(%rsi), %rsi
        leaq 64(%rdi), %rdi
        decl %ecx
        jnz .L_cache_w0

        /* Are there any trailing 8-byte words? */
.L_no_whole_cache_lines:
        movl %edx, %ecx
        andl $7, %edx
        shrl $3, %ecx
        jz .L_no_whole_words

        /* Copy trailing words */
.L_copy_trailing_words:
        movq (%rsi), %r8
        movq %r8, (%rdi)
        leaq 8(%rsi), %rsi
        leaq 8(%rdi), %rdi
        decl %ecx
        jnz .L_copy_trailing_words

        /* Any trailing bytes? */
.L_no_whole_words:
        andl %edx, %edx
        jz .L_done_memcpy_trap

        /* Copy trailing bytes */
        movl %edx, %ecx
.L_copy_trailing_bytes:
        movb (%rsi), %al
        movb %al, (%rdi)
        incq %rsi
        incq %rdi
        decl %ecx
        jnz .L_copy_trailing_bytes

        /* Copy successful. Return zero */
.L_done_memcpy_trap:
        xorq %rax, %rax
        ret
ENDPROC(memcpy_mcsafe_unrolled)
EXPORT_SYMBOL_GPL(memcpy_mcsafe_unrolled)

        .section .fixup, "ax"
        /* Return -EFAULT for any failure */
.L_memcpy_mcsafe_fail:
        movq $-EFAULT, %rax
        ret

        .previous
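/*
 * Each entry below maps a potentially faulting source load to the
 * -EFAULT fixup above; only loads are listed since, as noted in the
 * header comment, stores to the destination do not generate machine
 * checks here.
 */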
        _ASM_EXTABLE_FAULT(.L_copy_leading_bytes, .L_memcpy_mcsafe_fail)
        _ASM_EXTABLE_FAULT(.L_cache_w0, .L_memcpy_mcsafe_fail)
        _ASM_EXTABLE_FAULT(.L_cache_w1, .L_memcpy_mcsafe_fail)
        _ASM_EXTABLE_FAULT(.L_cache_w2, .L_memcpy_mcsafe_fail)
        _ASM_EXTABLE_FAULT(.L_cache_w3, .L_memcpy_mcsafe_fail)
        _ASM_EXTABLE_FAULT(.L_cache_w4, .L_memcpy_mcsafe_fail)
        _ASM_EXTABLE_FAULT(.L_cache_w5, .L_memcpy_mcsafe_fail)
        _ASM_EXTABLE_FAULT(.L_cache_w6, .L_memcpy_mcsafe_fail)
        _ASM_EXTABLE_FAULT(.L_cache_w7, .L_memcpy_mcsafe_fail)
        _ASM_EXTABLE_FAULT(.L_copy_trailing_words, .L_memcpy_mcsafe_fail)
        _ASM_EXTABLE_FAULT(.L_copy_trailing_bytes, .L_memcpy_mcsafe_fail)
#endif