/*
 * arch/xtensa/lib/hal/memcopy.S -- Core HAL library functions
 * xthal_memcpy and xthal_bcopy
 *
 * This file is subject to the terms and conditions of the GNU General Public
 * License.  See the file "COPYING" in the main directory of this archive
 * for more details.
 *
 * Copyright (C) 2002 - 2005 Tensilica Inc.
 */

#include <variant/core.h>

	.macro	src_b	r, w0, w1
#ifdef __XTENSA_EB__
	src	\r, \w0, \w1
#else
	src	\r, \w1, \w0
#endif
	.endm

	.macro	ssa8	r
#ifdef __XTENSA_EB__
	ssa8b	\r
#else
	ssa8l	\r
#endif
	.endm
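
/*
 * Example of how these two macros cooperate on a little-endian core
 * (illustrative only): for a source misaligned by one byte, "ssa8"
 * sets SAR = 8; then, given two aligned source words w0 = bytes
 * b3..b0 and w1 = bytes b7..b4 (MSB..LSB), "src_b r, w0, w1" expands
 * to "src r, w1, w0", which shifts the 64-bit pair {w1:w0} right by
 * SAR and keeps the low 32 bits: r = b4..b1, exactly the four bytes
 * that start at the misaligned source address.
 */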

/*
 * void *memcpy(void *dst, const void *src, size_t len);
 * void *memmove(void *dst, const void *src, size_t len);
 * void *bcopy(const void *src, void *dst, size_t len);
 *
 * This function is intended to do the same thing as the standard
 * library function memcpy() (or bcopy()) for most cases.
 * However, where the source and/or destination references
 * an instruction RAM or ROM or a data RAM or ROM, that
 * source and/or destination will always be accessed with
 * 32-bit load and store instructions (as required for these
 * types of devices).
 *
 * !!!!!!!  XTFIXME:
 * !!!!!!!  Handling of IRAM/IROM has not yet
 * !!!!!!!  been implemented.
 *
 * The bcopy version is provided here to avoid the overhead
 * of an extra call, for callers that require this convention.
 *
 * The (general case) algorithm is as follows:
 *   If destination is unaligned, align it by conditionally
 *     copying 1 and 2 bytes.
 *   If source is aligned,
 *     do 16 bytes with a loop, and then finish up with
 *     8, 4, 2, and 1 byte copies conditional on the length;
 *   else (if source is unaligned),
 *     do the same, but use SRC to align the source data.
 *   This code tries to use fall-through branches for the common
 *     case of aligned source and destination and multiple
 *     of 4 (or 8) length.
 *
 * Register use:
 *	a0/ return address
 *	a1/ stack pointer
 *	a2/ return value
 *	a3/ src
 *	a4/ length
 *	a5/ dst
 *	a6/ tmp
 *	a7/ tmp
 *	a8/ tmp
 *	a9/ tmp
 *	a10/ tmp
 *	a11/ tmp
 */
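
/*
 * For orientation, here is a rough C equivalent of the flow described
 * above (a hypothetical sketch only -- "copy_sketch" is not part of
 * this file or any build; it ignores the IRAM/IROM 32-bit access
 * rule, the short-length byte-copy fallback, and the SRC-based merge
 * used for unaligned sources):
 *
 *	#include <stdint.h>
 *	#include <string.h>
 *
 *	void *copy_sketch(void *dst, const void *src, size_t n)
 *	{
 *		unsigned char *d = dst;
 *		const unsigned char *s = src;
 *
 *		while (((uintptr_t)d & 3) && n) {	// align destination
 *			*d++ = *s++;
 *			n--;
 *		}
 *		while (n >= 16) {	// main loop: 16 bytes per iteration
 *			memcpy(d, s, 16);
 *			d += 16; s += 16; n -= 16;
 *		}
 *		// finish with 8/4/2/1-byte copies keyed off the low bits
 *		if (n & 8) { memcpy(d, s, 8); d += 8; s += 8; }
 *		if (n & 4) { memcpy(d, s, 4); d += 4; s += 4; }
 *		if (n & 2) { *d++ = *s++; *d++ = *s++; }
 *		if (n & 1) *d = *s;
 *		return dst;
 *	}
 */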

	.text
	.align	4
	.global	bcopy
	.type	bcopy,@function
bcopy:
	entry	sp, 16		# minimal stack frame
	# a2=src, a3=dst, a4=len
	mov	a5, a3		# copy dst so that a2 is return value
	mov	a3, a2
	mov	a2, a5
	j	.Lcommon	# go to common code for memcpy+bcopy
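
/*
 * bcopy(src, dst, len) and memcpy(dst, src, len) take their pointer
 * arguments in opposite order; the three MOVs above swap a2 and a3
 * through a5, so the shared .Lcommon path always sees dst in a2 (the
 * return value) and src in a3.
 */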

/*
 * Byte by byte copy
 */
	.align	4
	.byte	0		# 1 mod 4 alignment for LOOPNEZ
				# (0 mod 4 alignment for LBEG)
.Lbytecopy:
#if XCHAL_HAVE_LOOPS
	loopnez	a4, .Lbytecopydone
#else /* !XCHAL_HAVE_LOOPS */
	beqz	a4, .Lbytecopydone
	add	a7, a3, a4	# a7 = end address for source
#endif /* !XCHAL_HAVE_LOOPS */
.Lnextbyte:
	l8ui	a6, a3, 0
	addi	a3, a3, 1
	s8i	a6, a5, 0
	addi	a5, a5, 1
#if !XCHAL_HAVE_LOOPS
	blt	a3, a7, .Lnextbyte
#endif /* !XCHAL_HAVE_LOOPS */
.Lbytecopydone:
	retw
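
/*
 * Note on the two loop forms above: with the Xtensa loop option,
 * LOOPNEZ starts a zero-overhead hardware loop that executes the body
 * a4 times and skips it entirely when a4 == 0, so no backward branch
 * is needed; without the option, an explicit end pointer (a7) plus a
 * BLT close the loop instead.
 */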

/*
 * Destination is unaligned
 */
	.align	4
.Ldst1mod2:	# dst is only byte aligned
	_bltui	a4, 7, .Lbytecopy	# do short copies byte by byte
	# copy 1 byte
	l8ui	a6, a3, 0
	addi	a3, a3, 1
	addi	a4, a4, -1
	s8i	a6, a5, 0
	addi	a5, a5, 1
	_bbci.l	a5, 1, .Ldstaligned	# if dst is now aligned, then
					# return to main algorithm
.Ldst2mod4:	# dst 16-bit aligned
	# copy 2 bytes
	_bltui	a4, 6, .Lbytecopy	# do short copies byte by byte
	l8ui	a6, a3, 0
	l8ui	a7, a3, 1
	addi	a3, a3, 2
	addi	a4, a4, -2
	s8i	a6, a5, 0
	s8i	a7, a5, 1
	addi	a5, a5, 2
	j	.Ldstaligned	# dst is now aligned, return to main algorithm
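
/*
 * Note: the memmove entry point below shares this forward-copying
 * memcpy path.  A forward copy is not safe for every overlap of
 * source and destination (a standard memmove also copies backwards
 * when the destination starts inside the source), so callers should
 * not rely on full memmove overlap semantics here.
 */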

	.align	4
	.global	memcpy
	.type	memcpy,@function
memcpy:
	.global	memmove
	.type	memmove,@function
memmove:
	entry	sp, 16		# minimal stack frame
	# a2/ dst, a3/ src, a4/ len
	mov	a5, a2		# copy dst so that a2 is return value
.Lcommon:
	_bbsi.l	a2, 0, .Ldst1mod2	# if dst is 1 mod 2
	_bbsi.l	a2, 1, .Ldst2mod4	# if dst is 2 mod 4
.Ldstaligned:	# return here from .Ldst?mod? once dst is aligned
	srli	a7, a4, 4	# number of loop iterations with 16B
				# per iteration
	movi	a8, 3		# if source is not aligned,
	_bany	a3, a8, .Lsrcunaligned	# then use shifting copy
/*
 * Destination and source are word-aligned, use word copy.
 */
	# copy 16 bytes per iteration for word-aligned dst and word-aligned src
#if XCHAL_HAVE_LOOPS
	loopnez	a7, .Loop1done
#else /* !XCHAL_HAVE_LOOPS */
	beqz	a7, .Loop1done
	slli	a8, a7, 4
	add	a8, a8, a3	# a8 = end of last 16B source chunk
#endif /* !XCHAL_HAVE_LOOPS */
.Loop1:
	l32i	a6, a3, 0
	l32i	a7, a3, 4
	s32i	a6, a5, 0
	l32i	a6, a3, 8
	s32i	a7, a5, 4
	l32i	a7, a3, 12
	s32i	a6, a5, 8
	addi	a3, a3, 16
	s32i	a7, a5, 12
	addi	a5, a5, 16
#if !XCHAL_HAVE_LOOPS
	blt	a3, a8, .Loop1
#endif /* !XCHAL_HAVE_LOOPS */
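
/*
 * The loads and stores in .Loop1 above are interleaved so that each
 * L32I result is consumed by its S32I one or more instructions later,
 * which helps hide load-use latency on typical Xtensa pipelines.
 */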
.Loop1done:
	bbci.l	a4, 3, .L2
	# copy 8 bytes
	l32i	a6, a3, 0
	l32i	a7, a3, 4
	addi	a3, a3, 8
	s32i	a6, a5, 0
	s32i	a7, a5, 4
	addi	a5, a5, 8
.L2:
	bbsi.l	a4, 2, .L3
	bbsi.l	a4, 1, .L4
	bbsi.l	a4, 0, .L5
	retw
.L3:
	# copy 4 bytes
	l32i	a6, a3, 0
	addi	a3, a3, 4
	s32i	a6, a5, 0
	addi	a5, a5, 4
	bbsi.l	a4, 1, .L4
	bbsi.l	a4, 0, .L5
	retw
.L4:
	# copy 2 bytes
	l16ui	a6, a3, 0
	addi	a3, a3, 2
	s16i	a6, a5, 0
	addi	a5, a5, 2
	bbsi.l	a4, 0, .L5
	retw
.L5:
	# copy 1 byte
	l8ui	a6, a3, 0
	s8i	a6, a5, 0
	retw
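
/*
 * The tail above never loops: bits 3..0 of the remaining length in a4
 * directly select the 8-, 4-, 2- and 1-byte copies through BBSI/BBCI
 * bit tests, so a residue of 0..15 bytes finishes in at most four
 * short steps.
 */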

/*
 * Destination is aligned, Source is unaligned
 */
	.align	4
.Lsrcunaligned:
	_beqz	a4, .Ldone	# avoid loading anything for zero-length copies
	# copy 16 bytes per iteration for word-aligned dst and unaligned src
	ssa8	a3		# set shift amount from byte offset
#define SIM_CHECKS_ALIGNMENT	1	/* set to 1 when running on ISS (simulator)
					 * with the lint or ferret client, or 0
					 * to save a few cycles */
#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
	and	a11, a3, a8	# save unalignment offset for below
	sub	a3, a3, a11	# align a3
#endif
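
/*
 * Rationale for the pre-alignment above: when unaligned loads would
 * trap (XCHAL_UNALIGNED_LOAD_EXCEPTION) or the simulator's alignment
 * checkers are active, a3 is rounded down to a word boundary and the
 * byte offset is saved in a11.  Every load below is then word-aligned;
 * the src_b merges reassemble the shifted data, and a11 restores the
 * true source address before the final sub-word copies at .L13.
 */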
	l32i	a6, a3, 0	# load first word
#if XCHAL_HAVE_LOOPS
	loopnez	a7, .Loop2done
#else /* !XCHAL_HAVE_LOOPS */
	beqz	a7, .Loop2done
	slli	a10, a7, 4
	add	a10, a10, a3	# a10 = end of last 16B source chunk
#endif /* !XCHAL_HAVE_LOOPS */
.Loop2:
	l32i	a7, a3, 4
	l32i	a8, a3, 8
	src_b	a6, a6, a7
	s32i	a6, a5, 0
	l32i	a9, a3, 12
	src_b	a7, a7, a8
	s32i	a7, a5, 4
	l32i	a6, a3, 16
	src_b	a8, a8, a9
	s32i	a8, a5, 8
	addi	a3, a3, 16
	src_b	a9, a9, a6
	s32i	a9, a5, 12
	addi	a5, a5, 16
#if !XCHAL_HAVE_LOOPS
	blt	a3, a10, .Loop2
#endif /* !XCHAL_HAVE_LOOPS */
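
/*
 * The loop above is software-pipelined: a6 always carries the most
 * recent aligned word across iterations.  Each pass loads four new
 * words (offsets 4..16), and each src_b merges two adjacent words
 * into one correctly aligned destination word; the word loaded at
 * offset 16 feeds the last merge of this pass and becomes a6 for the
 * next one.
 */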
.Loop2done:
	bbci.l	a4, 3, .L12
	# copy 8 bytes
	l32i	a7, a3, 4
	l32i	a8, a3, 8
	src_b	a6, a6, a7
	s32i	a6, a5, 0
	addi	a3, a3, 8
	src_b	a7, a7, a8
	s32i	a7, a5, 4
	addi	a5, a5, 8
	mov	a6, a8
.L12:
	bbci.l	a4, 2, .L13
	# copy 4 bytes
	l32i	a7, a3, 4
	addi	a3, a3, 4
	src_b	a6, a6, a7
	s32i	a6, a5, 0
	addi	a5, a5, 4
	mov	a6, a7
.L13:
#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
	add	a3, a3, a11	# readjust a3 with correct misalignment
#endif
	bbsi.l	a4, 1, .L14
	bbsi.l	a4, 0, .L15
.Ldone:	retw
.L14:
	# copy 2 bytes
	l8ui	a6, a3, 0
	l8ui	a7, a3, 1
	addi	a3, a3, 2
	s8i	a6, a5, 0
	s8i	a7, a5, 1
	addi	a5, a5, 2
	bbsi.l	a4, 0, .L15
	retw
.L15:
	# copy 1 byte
	l8ui	a6, a3, 0
	s8i	a6, a5, 0
	retw

/*
 * Local Variables:
 * mode:fundamental
 * comment-start: "# "
 * comment-start-skip: "# *"
 * End:
 */