/*
 * Copyright (C) 2008-2009 Michal Simek <monstr@monstr.eu>
 * Copyright (C) 2008-2009 PetaLogix
 * Copyright (C) 2008 Jim Law - Iris LP  All rights reserved.
 *
 * This file is subject to the terms and conditions of the GNU General
 * Public License. See the file COPYING in the main directory of this
 * archive for more details.
 *
 * Written by Jim Law <jlaw@irispower.com>
 *
 * intended to replace:
 *	memcpy in memcpy.c and
 *	memmove in memmove.c
 *	... in arch/microblaze/lib
 *
 *
 * fastcopy.S
 *
 * Attempt at quicker memcpy and memmove for MicroBlaze
 *	Input : Operand1 in Reg r5 - destination address
 *		Operand2 in Reg r6 - source address
 *		Operand3 in Reg r7 - number of bytes to transfer
 *	Output: Result in Reg r3 - starting destination address
 *
 *
 * Explanation:
 *	Perform (possibly unaligned) copy of a block of memory
 *	between mem locations with size of xfer spec'd in bytes
 */
  31. #ifdef __MICROBLAZEEL__
  32. #error Microblaze LE not support ASM optimized lib func. Disable OPT_LIB_ASM.
  33. #endif
  34. #include <linux/linkage.h>
  35. .text
/*
 * void *memcpy(void *d, const void *s, size_t c)
 *
 * In:       r5 = d (destination), r6 = s (source), r7 = c (byte count)
 * Out:      r3 = original destination pointer (function return value)
 * Clobbers: r4, r8, r9, r10, r11, r12
 *
 * Ascending copy in four phases:
 *   1. 0-3 single bytes until the destination is word aligned
 *   2. unrolled 32-byte blocks (plain word copies when the source is
 *      also word aligned; otherwise aligned reads merged with shifts)
 *   3. remaining whole words
 *   4. 0-3 tail bytes
 * Loop back-edges use delay-slot branches (brid/bneid): the instruction
 * after the branch executes before the branch takes effect.
 * Big-endian MicroBlaze only (see the __MICROBLAZEEL__ guard above).
 */
.globl memcpy
.type  memcpy, @function
.ent memcpy

memcpy:
fast_memcpy_ascending:
	/* move d to return register as value of function */
	addi	r3, r5, 0

	addi	r4, r0, 4		/* n = 4 */
	cmpu	r4, r4, r7		/* n = c - n (unsigned) */
	blti	r4, a_xfer_end		/* if n < 0, less than one word to transfer */

	/* transfer first 0~3 bytes to get aligned dest address */
	andi	r4, r5, 3		/* n = d & 3 */
	/* if zero, destination already aligned */
	beqi	r4, a_dalign_done
	/* n = 4 - n (yields 3, 2, 1 transfers for 1, 2, 3 addr offset) */
	rsubi	r4, r4, 4
	rsub	r7, r4, r7		/* c = c - n adjust c */

a_xfer_first_loop:
	/* if no bytes left to transfer, transfer the bulk */
	beqi	r4, a_dalign_done
	lbui	r11, r6, 0		/* h = *s */
	sbi	r11, r5, 0		/* *d = h */
	addi	r6, r6, 1		/* s++ */
	addi	r5, r5, 1		/* d++ */
	brid	a_xfer_first_loop	/* loop */
	addi	r4, r4, -1		/* n-- (IN DELAY SLOT) */

a_dalign_done:
	addi	r4, r0, 32		/* n = 32 */
	cmpu	r4, r4, r7		/* n = c - n (unsigned) */
	/* if n < 0, less than one block to transfer */
	blti	r4, a_block_done

a_block_xfer:
	andi	r4, r7, 0xffffffe0	/* n = c & ~31 */
	rsub	r7, r4, r7		/* c = c - n */
	andi	r9, r6, 3		/* t1 = s & 3 */
	/* if temp != 0, unaligned transfers needed */
	bnei	r9, a_block_unaligned

	/* source and destination both word aligned: straight 32-byte blocks */
a_block_aligned:
	lwi	r9, r6, 0		/* t1 = *(s + 0) */
	lwi	r10, r6, 4		/* t2 = *(s + 4) */
	lwi	r11, r6, 8		/* t3 = *(s + 8) */
	lwi	r12, r6, 12		/* t4 = *(s + 12) */
	swi	r9, r5, 0		/* *(d + 0) = t1 */
	swi	r10, r5, 4		/* *(d + 4) = t2 */
	swi	r11, r5, 8		/* *(d + 8) = t3 */
	swi	r12, r5, 12		/* *(d + 12) = t4 */
	lwi	r9, r6, 16		/* t1 = *(s + 16) */
	lwi	r10, r6, 20		/* t2 = *(s + 20) */
	lwi	r11, r6, 24		/* t3 = *(s + 24) */
	lwi	r12, r6, 28		/* t4 = *(s + 28) */
	swi	r9, r5, 16		/* *(d + 16) = t1 */
	swi	r10, r5, 20		/* *(d + 20) = t2 */
	swi	r11, r5, 24		/* *(d + 24) = t3 */
	swi	r12, r5, 28		/* *(d + 28) = t4 */
	addi	r6, r6, 32		/* s = s + 32 */
	addi	r4, r4, -32		/* n = n - 32 */
	bneid	r4, a_block_aligned	/* while (n) loop */
	addi	r5, r5, 32		/* d = d + 32 (IN DELAY SLOT) */
	bri	a_block_done

	/*
	 * Misaligned source: read from the word-aligned address at/below s
	 * (r8 = "as") and build each destination word by merging two
	 * adjacent aligned words with shifts; r11 ("h") carries the
	 * leftover bytes from the previous read.  r9 (= s & 3, i.e. 1, 2
	 * or 3) selects which shift pair to use.  s itself is advanced
	 * past the whole block region up front since the loops use r8.
	 */
a_block_unaligned:
	andi	r8, r6, 0xfffffffc	/* as = s & ~3 */
	add	r6, r6, r4		/* s = s + n */
	lwi	r11, r8, 0		/* h = *(as + 0) */

	addi	r9, r9, -1
	beqi	r9, a_block_u1		/* t1 was 1 => 1 byte offset */
	addi	r9, r9, -1
	beqi	r9, a_block_u2		/* t1 was 2 => 2 byte offset */

a_block_u3:
	bslli	r11, r11, 24	/* h = h << 24 */
a_bu3_loop:
	lwi	r12, r8, 4	/* v = *(as + 4) */
	bsrli	r9, r12, 8	/* t1 = v >> 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 0	/* *(d + 0) = t1 */
	bslli	r11, r12, 24	/* h = v << 24 */
	lwi	r12, r8, 8	/* v = *(as + 8) */
	bsrli	r9, r12, 8	/* t1 = v >> 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 4	/* *(d + 4) = t1 */
	bslli	r11, r12, 24	/* h = v << 24 */
	lwi	r12, r8, 12	/* v = *(as + 12) */
	bsrli	r9, r12, 8	/* t1 = v >> 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 8	/* *(d + 8) = t1 */
	bslli	r11, r12, 24	/* h = v << 24 */
	lwi	r12, r8, 16	/* v = *(as + 16) */
	bsrli	r9, r12, 8	/* t1 = v >> 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 12	/* *(d + 12) = t1 */
	bslli	r11, r12, 24	/* h = v << 24 */
	lwi	r12, r8, 20	/* v = *(as + 20) */
	bsrli	r9, r12, 8	/* t1 = v >> 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 16	/* *(d + 16) = t1 */
	bslli	r11, r12, 24	/* h = v << 24 */
	lwi	r12, r8, 24	/* v = *(as + 24) */
	bsrli	r9, r12, 8	/* t1 = v >> 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 20	/* *(d + 20) = t1 */
	bslli	r11, r12, 24	/* h = v << 24 */
	lwi	r12, r8, 28	/* v = *(as + 28) */
	bsrli	r9, r12, 8	/* t1 = v >> 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 24	/* *(d + 24) = t1 */
	bslli	r11, r12, 24	/* h = v << 24 */
	lwi	r12, r8, 32	/* v = *(as + 32) */
	bsrli	r9, r12, 8	/* t1 = v >> 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 28	/* *(d + 28) = t1 */
	bslli	r11, r12, 24	/* h = v << 24 */
	addi	r8, r8, 32	/* as = as + 32 */
	addi	r4, r4, -32	/* n = n - 32 */
	bneid	r4, a_bu3_loop	/* while (n) loop */
	addi	r5, r5, 32	/* d = d + 32 (IN DELAY SLOT) */
	bri	a_block_done

a_block_u1:
	bslli	r11, r11, 8	/* h = h << 8 */
a_bu1_loop:
	lwi	r12, r8, 4	/* v = *(as + 4) */
	bsrli	r9, r12, 24	/* t1 = v >> 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 0	/* *(d + 0) = t1 */
	bslli	r11, r12, 8	/* h = v << 8 */
	lwi	r12, r8, 8	/* v = *(as + 8) */
	bsrli	r9, r12, 24	/* t1 = v >> 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 4	/* *(d + 4) = t1 */
	bslli	r11, r12, 8	/* h = v << 8 */
	lwi	r12, r8, 12	/* v = *(as + 12) */
	bsrli	r9, r12, 24	/* t1 = v >> 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 8	/* *(d + 8) = t1 */
	bslli	r11, r12, 8	/* h = v << 8 */
	lwi	r12, r8, 16	/* v = *(as + 16) */
	bsrli	r9, r12, 24	/* t1 = v >> 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 12	/* *(d + 12) = t1 */
	bslli	r11, r12, 8	/* h = v << 8 */
	lwi	r12, r8, 20	/* v = *(as + 20) */
	bsrli	r9, r12, 24	/* t1 = v >> 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 16	/* *(d + 16) = t1 */
	bslli	r11, r12, 8	/* h = v << 8 */
	lwi	r12, r8, 24	/* v = *(as + 24) */
	bsrli	r9, r12, 24	/* t1 = v >> 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 20	/* *(d + 20) = t1 */
	bslli	r11, r12, 8	/* h = v << 8 */
	lwi	r12, r8, 28	/* v = *(as + 28) */
	bsrli	r9, r12, 24	/* t1 = v >> 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 24	/* *(d + 24) = t1 */
	bslli	r11, r12, 8	/* h = v << 8 */
	lwi	r12, r8, 32	/* v = *(as + 32) */
	bsrli	r9, r12, 24	/* t1 = v >> 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 28	/* *(d + 28) = t1 */
	bslli	r11, r12, 8	/* h = v << 8 */
	addi	r8, r8, 32	/* as = as + 32 */
	addi	r4, r4, -32	/* n = n - 32 */
	bneid	r4, a_bu1_loop	/* while (n) loop */
	addi	r5, r5, 32	/* d = d + 32 (IN DELAY SLOT) */
	bri	a_block_done

a_block_u2:
	bslli	r11, r11, 16	/* h = h << 16 */
a_bu2_loop:
	lwi	r12, r8, 4	/* v = *(as + 4) */
	bsrli	r9, r12, 16	/* t1 = v >> 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 0	/* *(d + 0) = t1 */
	bslli	r11, r12, 16	/* h = v << 16 */
	lwi	r12, r8, 8	/* v = *(as + 8) */
	bsrli	r9, r12, 16	/* t1 = v >> 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 4	/* *(d + 4) = t1 */
	bslli	r11, r12, 16	/* h = v << 16 */
	lwi	r12, r8, 12	/* v = *(as + 12) */
	bsrli	r9, r12, 16	/* t1 = v >> 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 8	/* *(d + 8) = t1 */
	bslli	r11, r12, 16	/* h = v << 16 */
	lwi	r12, r8, 16	/* v = *(as + 16) */
	bsrli	r9, r12, 16	/* t1 = v >> 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 12	/* *(d + 12) = t1 */
	bslli	r11, r12, 16	/* h = v << 16 */
	lwi	r12, r8, 20	/* v = *(as + 20) */
	bsrli	r9, r12, 16	/* t1 = v >> 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 16	/* *(d + 16) = t1 */
	bslli	r11, r12, 16	/* h = v << 16 */
	lwi	r12, r8, 24	/* v = *(as + 24) */
	bsrli	r9, r12, 16	/* t1 = v >> 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 20	/* *(d + 20) = t1 */
	bslli	r11, r12, 16	/* h = v << 16 */
	lwi	r12, r8, 28	/* v = *(as + 28) */
	bsrli	r9, r12, 16	/* t1 = v >> 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 24	/* *(d + 24) = t1 */
	bslli	r11, r12, 16	/* h = v << 16 */
	lwi	r12, r8, 32	/* v = *(as + 32) */
	bsrli	r9, r12, 16	/* t1 = v >> 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 28	/* *(d + 28) = t1 */
	bslli	r11, r12, 16	/* h = v << 16 */
	addi	r8, r8, 32	/* as = as + 32 */
	addi	r4, r4, -32	/* n = n - 32 */
	bneid	r4, a_bu2_loop	/* while (n) loop */
	addi	r5, r5, 32	/* d = d + 32 (IN DELAY SLOT) */

a_block_done:
	addi	r4, r0, 4	/* n = 4 */
	cmpu	r4, r4, r7	/* n = c - n (unsigned) */
	blti	r4, a_xfer_end	/* if n < 0, less than one word to transfer */

	/* phase 3: copy the remaining whole words (r10 = running offset) */
a_word_xfer:
	andi	r4, r7, 0xfffffffc	/* n = c & ~3 */
	addi	r10, r0, 0		/* offset = 0 */
	andi	r9, r6, 3		/* t1 = s & 3 */
	/* if temp != 0, unaligned transfers needed */
	bnei	r9, a_word_unaligned

a_word_aligned:
	lw	r9, r6, r10		/* t1 = *(s+offset) */
	sw	r9, r5, r10		/* *(d+offset) = t1 */
	addi	r4, r4,-4		/* n-- */
	bneid	r4, a_word_aligned	/* loop */
	addi	r10, r10, 4		/* offset++ (IN DELAY SLOT) */
	bri	a_word_done

a_word_unaligned:
	andi	r8, r6, 0xfffffffc	/* as = s & ~3 */
	lwi	r11, r8, 0		/* h = *(as + 0) */
	addi	r8, r8, 4		/* as = as + 4 */

	addi	r9, r9, -1
	beqi	r9, a_word_u1		/* t1 was 1 => 1 byte offset */
	addi	r9, r9, -1
	beqi	r9, a_word_u2		/* t1 was 2 => 2 byte offset */

a_word_u3:
	bslli	r11, r11, 24	/* h = h << 24 */
a_wu3_loop:
	lw	r12, r8, r10	/* v = *(as + offset) */
	bsrli	r9, r12, 8	/* t1 = v >> 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	sw	r9, r5, r10	/* *(d + offset) = t1 */
	bslli	r11, r12, 24	/* h = v << 24 */
	addi	r4, r4,-4	/* n = n - 4 */
	bneid	r4, a_wu3_loop	/* while (n) loop */
	addi	r10, r10, 4	/* offset = offset + 4 (IN DELAY SLOT) */
	bri	a_word_done

a_word_u1:
	bslli	r11, r11, 8	/* h = h << 8 */
a_wu1_loop:
	lw	r12, r8, r10	/* v = *(as + offset) */
	bsrli	r9, r12, 24	/* t1 = v >> 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	sw	r9, r5, r10	/* *(d + offset) = t1 */
	bslli	r11, r12, 8	/* h = v << 8 */
	addi	r4, r4,-4	/* n = n - 4 */
	bneid	r4, a_wu1_loop	/* while (n) loop */
	addi	r10, r10, 4	/* offset = offset + 4 (IN DELAY SLOT) */
	bri	a_word_done

a_word_u2:
	bslli	r11, r11, 16	/* h = h << 16 */
a_wu2_loop:
	lw	r12, r8, r10	/* v = *(as + offset) */
	bsrli	r9, r12, 16	/* t1 = v >> 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	sw	r9, r5, r10	/* *(d + offset) = t1 */
	bslli	r11, r12, 16	/* h = v << 16 */
	addi	r4, r4,-4	/* n = n - 4 */
	bneid	r4, a_wu2_loop	/* while (n) loop */
	addi	r10, r10, 4	/* offset = offset + 4 (IN DELAY SLOT) */

a_word_done:
	add	r5, r5, r10	/* d = d + offset */
	add	r6, r6, r10	/* s = s + offset */
	rsub	r7, r10, r7	/* c = c - offset */

	/* phase 4: 0-3 remaining tail bytes, one at a time */
a_xfer_end:
a_xfer_end_loop:
	beqi	r7, a_done	/* while (c) */
	lbui	r9, r6, 0	/* t1 = *s */
	addi	r6, r6, 1	/* s++ */
	sbi	r9, r5, 0	/* *d = t1 */
	addi	r7, r7, -1	/* c-- */
	brid	a_xfer_end_loop	/* loop */
	addi	r5, r5, 1	/* d++ (IN DELAY SLOT) */

a_done:
	rtsd	r15, 8
	nop

.size memcpy, . - memcpy
.end memcpy
  324. /*----------------------------------------------------------------------------*/
/*
 * void *memmove(void *d, const void *s, size_t c)
 *
 * In:       r5 = d (destination), r6 = s (source), r7 = c (byte count)
 * Out:      r3 = original destination pointer (function return value)
 * Clobbers: r4, r8, r9, r10, r11, r12
 *
 * Overlap handling: if s >= d the regions are safe to copy ascending,
 * so this falls straight into memcpy's fast_memcpy_ascending.  Otherwise
 * (d above s, potentially overlapping) both pointers are moved to the
 * end of the regions and the copy runs descending, mirroring memcpy's
 * four phases in reverse order.  Same delay-slot conventions as memcpy.
 */
.globl memmove
.type  memmove, @function
.ent memmove

memmove:
	cmpu	r4, r5, r6	/* n = s - d */
	bgei	r4,fast_memcpy_ascending

fast_memcpy_descending:
	/* move d to return register as value of function */
	addi	r3, r5, 0

	add	r5, r5, r7	/* d = d + c */
	add	r6, r6, r7	/* s = s + c */

	addi	r4, r0, 4	/* n = 4 */
	cmpu	r4, r4, r7	/* n = c - n (unsigned) */
	blti	r4,d_xfer_end	/* if n < 0, less than one word to transfer */

	/* transfer first 0~3 bytes to get aligned dest address */
	andi	r4, r5, 3	/* n = d & 3 */
	/* if zero, destination already aligned */
	beqi	r4,d_dalign_done
	rsub	r7, r4, r7	/* c = c - n adjust c */

d_xfer_first_loop:
	/* if no bytes left to transfer, transfer the bulk */
	beqi	r4,d_dalign_done
	addi	r6, r6, -1	/* s-- */
	addi	r5, r5, -1	/* d-- */
	lbui	r11, r6, 0	/* h = *s */
	sbi	r11, r5, 0	/* *d = h */
	brid	d_xfer_first_loop	/* loop */
	addi	r4, r4, -1	/* n-- (IN DELAY SLOT) */

d_dalign_done:
	addi	r4, r0, 32	/* n = 32 */
	cmpu	r4, r4, r7	/* n = c - n (unsigned) */
	/* if n < 0, less than one block to transfer */
	blti	r4, d_block_done

d_block_xfer:
	andi	r4, r7, 0xffffffe0	/* n = c & ~31 */
	rsub	r7, r4, r7		/* c = c - n */
	andi	r9, r6, 3		/* t1 = s & 3 */
	/* if temp != 0, unaligned transfers needed */
	bnei	r9, d_block_unaligned

	/* source and destination both word aligned: 32-byte blocks, descending */
d_block_aligned:
	addi	r6, r6, -32	/* s = s - 32 */
	addi	r5, r5, -32	/* d = d - 32 */
	lwi	r9, r6, 28	/* t1 = *(s + 28) */
	lwi	r10, r6, 24	/* t2 = *(s + 24) */
	lwi	r11, r6, 20	/* t3 = *(s + 20) */
	lwi	r12, r6, 16	/* t4 = *(s + 16) */
	swi	r9, r5, 28	/* *(d + 28) = t1 */
	swi	r10, r5, 24	/* *(d + 24) = t2 */
	swi	r11, r5, 20	/* *(d + 20) = t3 */
	swi	r12, r5, 16	/* *(d + 16) = t4 */
	lwi	r9, r6, 12	/* t1 = *(s + 12) */
	lwi	r10, r6, 8	/* t2 = *(s + 8) */
	lwi	r11, r6, 4	/* t3 = *(s + 4) */
	lwi	r12, r6, 0	/* t4 = *(s + 0) */
	swi	r9, r5, 12	/* *(d + 12) = t1 */
	swi	r10, r5, 8	/* *(d + 8) = t2 */
	swi	r11, r5, 4	/* *(d + 4) = t3 */
	addi	r4, r4, -32	/* n = n - 32 */
	bneid	r4, d_block_aligned	/* while (n) loop */
	swi	r12, r5, 0	/* *(d + 0) = t4 (IN DELAY SLOT) */
	bri	d_block_done

	/*
	 * Misaligned source, descending: read word-aligned words via
	 * r8 ("as") and merge adjacent pairs with shifts; r11 ("h")
	 * carries the leftover bytes downward.  r9 (= s & 3) selects the
	 * shift pair.  s is rewound past the block region up front.
	 */
d_block_unaligned:
	andi	r8, r6, 0xfffffffc	/* as = s & ~3 */
	rsub	r6, r4, r6		/* s = s - n */
	lwi	r11, r8, 0		/* h = *(as + 0) */

	addi	r9, r9, -1
	beqi	r9,d_block_u1		/* t1 was 1 => 1 byte offset */
	addi	r9, r9, -1
	beqi	r9,d_block_u2		/* t1 was 2 => 2 byte offset */

d_block_u3:
	bsrli	r11, r11, 8	/* h = h >> 8 */
d_bu3_loop:
	addi	r8, r8, -32	/* as = as - 32 */
	addi	r5, r5, -32	/* d = d - 32 */
	lwi	r12, r8, 28	/* v = *(as + 28) */
	bslli	r9, r12, 24	/* t1 = v << 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 28	/* *(d + 28) = t1 */
	bsrli	r11, r12, 8	/* h = v >> 8 */
	lwi	r12, r8, 24	/* v = *(as + 24) */
	bslli	r9, r12, 24	/* t1 = v << 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 24	/* *(d + 24) = t1 */
	bsrli	r11, r12, 8	/* h = v >> 8 */
	lwi	r12, r8, 20	/* v = *(as + 20) */
	bslli	r9, r12, 24	/* t1 = v << 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 20	/* *(d + 20) = t1 */
	bsrli	r11, r12, 8	/* h = v >> 8 */
	lwi	r12, r8, 16	/* v = *(as + 16) */
	bslli	r9, r12, 24	/* t1 = v << 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 16	/* *(d + 16) = t1 */
	bsrli	r11, r12, 8	/* h = v >> 8 */
	lwi	r12, r8, 12	/* v = *(as + 12) */
	bslli	r9, r12, 24	/* t1 = v << 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 12	/* *(d + 12) = t1 */
	bsrli	r11, r12, 8	/* h = v >> 8 */
	lwi	r12, r8, 8	/* v = *(as + 8) */
	bslli	r9, r12, 24	/* t1 = v << 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 8	/* *(d + 8) = t1 */
	bsrli	r11, r12, 8	/* h = v >> 8 */
	lwi	r12, r8, 4	/* v = *(as + 4) */
	bslli	r9, r12, 24	/* t1 = v << 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 4	/* *(d + 4) = t1 */
	bsrli	r11, r12, 8	/* h = v >> 8 */
	lwi	r12, r8, 0	/* v = *(as + 0) */
	bslli	r9, r12, 24	/* t1 = v << 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 0	/* *(d + 0) = t1 */
	addi	r4, r4, -32	/* n = n - 32 */
	bneid	r4, d_bu3_loop	/* while (n) loop */
	bsrli	r11, r12, 8	/* h = v >> 8 (IN DELAY SLOT) */
	bri	d_block_done

d_block_u1:
	bsrli	r11, r11, 24	/* h = h >> 24 */
d_bu1_loop:
	addi	r8, r8, -32	/* as = as - 32 */
	addi	r5, r5, -32	/* d = d - 32 */
	lwi	r12, r8, 28	/* v = *(as + 28) */
	bslli	r9, r12, 8	/* t1 = v << 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 28	/* *(d + 28) = t1 */
	bsrli	r11, r12, 24	/* h = v >> 24 */
	lwi	r12, r8, 24	/* v = *(as + 24) */
	bslli	r9, r12, 8	/* t1 = v << 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 24	/* *(d + 24) = t1 */
	bsrli	r11, r12, 24	/* h = v >> 24 */
	lwi	r12, r8, 20	/* v = *(as + 20) */
	bslli	r9, r12, 8	/* t1 = v << 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 20	/* *(d + 20) = t1 */
	bsrli	r11, r12, 24	/* h = v >> 24 */
	lwi	r12, r8, 16	/* v = *(as + 16) */
	bslli	r9, r12, 8	/* t1 = v << 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 16	/* *(d + 16) = t1 */
	bsrli	r11, r12, 24	/* h = v >> 24 */
	lwi	r12, r8, 12	/* v = *(as + 12) */
	bslli	r9, r12, 8	/* t1 = v << 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 12	/* *(d + 12) = t1 */
	bsrli	r11, r12, 24	/* h = v >> 24 */
	lwi	r12, r8, 8	/* v = *(as + 8) */
	bslli	r9, r12, 8	/* t1 = v << 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 8	/* *(d + 8) = t1 */
	bsrli	r11, r12, 24	/* h = v >> 24 */
	lwi	r12, r8, 4	/* v = *(as + 4) */
	bslli	r9, r12, 8	/* t1 = v << 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 4	/* *(d + 4) = t1 */
	bsrli	r11, r12, 24	/* h = v >> 24 */
	lwi	r12, r8, 0	/* v = *(as + 0) */
	bslli	r9, r12, 8	/* t1 = v << 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 0	/* *(d + 0) = t1 */
	addi	r4, r4, -32	/* n = n - 32 */
	bneid	r4, d_bu1_loop	/* while (n) loop */
	bsrli	r11, r12, 24	/* h = v >> 24 (IN DELAY SLOT) */
	bri	d_block_done

d_block_u2:
	bsrli	r11, r11, 16	/* h = h >> 16 */
d_bu2_loop:
	addi	r8, r8, -32	/* as = as - 32 */
	addi	r5, r5, -32	/* d = d - 32 */
	lwi	r12, r8, 28	/* v = *(as + 28) */
	bslli	r9, r12, 16	/* t1 = v << 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 28	/* *(d + 28) = t1 */
	bsrli	r11, r12, 16	/* h = v >> 16 */
	lwi	r12, r8, 24	/* v = *(as + 24) */
	bslli	r9, r12, 16	/* t1 = v << 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 24	/* *(d + 24) = t1 */
	bsrli	r11, r12, 16	/* h = v >> 16 */
	lwi	r12, r8, 20	/* v = *(as + 20) */
	bslli	r9, r12, 16	/* t1 = v << 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 20	/* *(d + 20) = t1 */
	bsrli	r11, r12, 16	/* h = v >> 16 */
	lwi	r12, r8, 16	/* v = *(as + 16) */
	bslli	r9, r12, 16	/* t1 = v << 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 16	/* *(d + 16) = t1 */
	bsrli	r11, r12, 16	/* h = v >> 16 */
	lwi	r12, r8, 12	/* v = *(as + 12) */
	bslli	r9, r12, 16	/* t1 = v << 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 12	/* *(d + 12) = t1 */
	bsrli	r11, r12, 16	/* h = v >> 16 */
	lwi	r12, r8, 8	/* v = *(as + 8) */
	bslli	r9, r12, 16	/* t1 = v << 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 8	/* *(d + 8) = t1 */
	bsrli	r11, r12, 16	/* h = v >> 16 */
	lwi	r12, r8, 4	/* v = *(as + 4) */
	bslli	r9, r12, 16	/* t1 = v << 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 4	/* *(d + 4) = t1 */
	bsrli	r11, r12, 16	/* h = v >> 16 */
	lwi	r12, r8, 0	/* v = *(as + 0) */
	bslli	r9, r12, 16	/* t1 = v << 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 0	/* *(d + 0) = t1 */
	addi	r4, r4, -32	/* n = n - 32 */
	bneid	r4, d_bu2_loop	/* while (n) loop */
	bsrli	r11, r12, 16	/* h = v >> 16 (IN DELAY SLOT) */

d_block_done:
	addi	r4, r0, 4	/* n = 4 */
	cmpu	r4, r4, r7	/* n = c - n (unsigned) */
	blti	r4,d_xfer_end	/* if n < 0, less than one word to transfer */

	/* remaining whole words, descending (r4 = byte count, counts down) */
d_word_xfer:
	andi	r4, r7, 0xfffffffc	/* n = c & ~3 */
	rsub	r5, r4, r5		/* d = d - n */
	rsub	r6, r4, r6		/* s = s - n */
	rsub	r7, r4, r7		/* c = c - n */
	andi	r9, r6, 3		/* t1 = s & 3 */
	/* if temp != 0, unaligned transfers needed */
	bnei	r9, d_word_unaligned

d_word_aligned:
	addi	r4, r4,-4	/* n-- */
	lw	r9, r6, r4	/* t1 = *(s+n) */
	bneid	r4, d_word_aligned	/* loop */
	sw	r9, r5, r4	/* *(d+n) = t1 (IN DELAY SLOT) */
	bri	d_word_done

d_word_unaligned:
	andi	r8, r6, 0xfffffffc	/* as = s & ~3 */
	lw	r11, r8, r4		/* h = *(as + n) */

	addi	r9, r9, -1
	beqi	r9,d_word_u1		/* t1 was 1 => 1 byte offset */
	addi	r9, r9, -1
	beqi	r9,d_word_u2		/* t1 was 2 => 2 byte offset */

d_word_u3:
	bsrli	r11, r11, 8	/* h = h >> 8 */
d_wu3_loop:
	addi	r4, r4,-4	/* n = n - 4 */
	lw	r12, r8, r4	/* v = *(as + n) */
	bslli	r9, r12, 24	/* t1 = v << 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	sw	r9, r5, r4	/* *(d + n) = t1 */
	bneid	r4, d_wu3_loop	/* while (n) loop */
	bsrli	r11, r12, 8	/* h = v >> 8 (IN DELAY SLOT) */
	bri	d_word_done

d_word_u1:
	bsrli	r11, r11, 24	/* h = h >> 24 */
d_wu1_loop:
	addi	r4, r4,-4	/* n = n - 4 */
	lw	r12, r8, r4	/* v = *(as + n) */
	bslli	r9, r12, 8	/* t1 = v << 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	sw	r9, r5, r4	/* *(d + n) = t1 */
	bneid	r4, d_wu1_loop	/* while (n) loop */
	bsrli	r11, r12, 24	/* h = v >> 24 (IN DELAY SLOT) */
	bri	d_word_done

d_word_u2:
	bsrli	r11, r11, 16	/* h = h >> 16 */
d_wu2_loop:
	addi	r4, r4,-4	/* n = n - 4 */
	lw	r12, r8, r4	/* v = *(as + n) */
	bslli	r9, r12, 16	/* t1 = v << 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	sw	r9, r5, r4	/* *(d + n) = t1 */
	bneid	r4, d_wu2_loop	/* while (n) loop */
	bsrli	r11, r12, 16	/* h = v >> 16 (IN DELAY SLOT) */

d_word_done:
d_xfer_end:
d_xfer_end_loop:
	/*
	 * NOTE(review): exits through a_done, memcpy's epilogue, which is
	 * instruction-for-instruction identical to d_done below (rtsd/nop);
	 * d_done is therefore only reachable by falling out of this loop.
	 */
	beqi	r7, a_done	/* while (c) */
	addi	r6, r6, -1	/* s-- */
	lbui	r9, r6, 0	/* t1 = *s */
	addi	r5, r5, -1	/* d-- */
	sbi	r9, r5, 0	/* *d = t1 */
	brid	d_xfer_end_loop	/* loop */
	addi	r7, r7, -1	/* c-- (IN DELAY SLOT) */

d_done:
	rtsd	r15, 8
	nop

.size memmove, . - memmove
.end memmove